freebsd-dev/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
Martin Matuska 8fc257994d Merge ZFS version 15 and almost all OpenSolaris bugfixes referenced
in Solaris 10 updates 141445-09 and 142901-14.

Detailed information:
(OpenSolaris revisions and Bug IDs, Solaris 10 patch numbers)

7844:effed23820ae
6755435	zfs_open() and zfs_close() needs to use ZFS_ENTER/ZFS_VERIFY_ZP (141445-01)

7897:e520d8258820
6748436	inconsistent zpool.cache in boot_archive could panic a zfs root filesystem upon boot-up (141445-01)

7965:b795da521357
6740164	zpool attach can create an illegal root pool (141909-02)

8084:b811cc60d650
6769612	zpool_import() will continue to write to cachefile even if altroot is set (N/A)

8121:7fd09d4ebd9c
6757430	want an option for zdb to disable space map loading and leak tracking (141445-01)

8129:e4f45a0bfbb0
6542860	ASSERT: reason != VDEV_LABEL_REMOVE||vdev_inuse(vd, crtxg, reason, 0) (141445-01)

8188:fd00c0a81e80
6761100	want zdb option to select older uberblocks (141445-01)

8190:6eeea43ced42
6774886	zfs_setattr() won't allow ndmp to restore SUNWattr_rw (141445-01)

8225:59a9961c2aeb
6737463	panic while trying to write out config file if root pool import fails (141445-01)

8227:f7d7be9b1f56
6765294	Refactor replay (141445-01)

8228:51e9ca9ee3a5
6572357	libzfs should do more to avoid mnttab lookups (141909-01)
6572376	zfs_iter_filesystems and zfs_iter_snapshots get objset stats twice (141909-01)

8241:5a60f16123ba
6328632	zpool offline is a bit too conservative (141445-01)
6739487	ASSERT: txg <= spa_final_txg due to scrub/export race (141445-01)
6767129	ASSERT: cvd->vdev_isspare, in spa_vdev_detach() (141445-01)
6747698	checksum failures after offline -t / export / import / scrub (141445-01)
6745863	ZFS writes to disk after it has been offlined (141445-01)
6722540	50% slowdown on scrub/resilver with certain vdev configurations (141445-01)
6759999	resilver logic rewrites ditto blocks on both source and destination (141445-01)
6758107	I/O should never suspend during spa_load() (141445-01)
6776548	codereview(1) runs off the page when faced with multi-line comments (N/A)
6761406	AMD errata 91 workaround doesn't work on 64-bit systems (141445-01)

8242:e46e4b2f0a03
6770866	GRUB/ZFS should require physical path or devid, but not both (141445-01)

8269:03a7e9050cfd
6674216	"zfs share" doesn't work, but "zfs set sharenfs=on" does (141445-01)
6621164	$SRC/cmd/zfs/zfs_main.c seems to have a syntax error in the translation note (141445-01)
6635482	i18n problems in libzfs_dataset.c and zfs_main.c (141445-01)
6595194	"zfs get" VALUE column is as wide as NAME (141445-01)
6722991	vdev_disk.c: error checking for ddi_pathname_to_dev_t() must test for NODEV (141445-01)
6396518	ASSERT strings shouldn't be pre-processed (141445-01)

8274:846b39508aff
6713916	scrub/resilver needlessly decompress data (141445-01)

8343:655db2375fed
6739553	libzfs_status msgid table is out of sync (141445-01)
6784104	libzfs unfairly rejects numerical values greater than 2^63 (141445-01)
6784108	zfs_realloc() should not free original memory on failure (141445-01)

8525:e0e0e525d0f8
6788830	set large value to reservation cause core dump (141445-01)
6791064	want sysevents for ZFS scrub (141445-01)
6791066	need to be able to set cachefile on faulted pools (141445-01)
6791071	zpool_do_import() should not enable datasets on faulted pools (141445-01)
6792134	getting multiple properties on a faulted pool leads to confusion (141445-01)

8547:bcc7b46e5ff7
6792884	Vista clients cannot access .zfs (141445-01)

8632:36ef517870a3
6798384	It can take a village to raise a zio (141445-01)

8636:7e4ce9158df3
6551866	deadlock between zfs_write(), zfs_freesp(), and zfs_putapage() (141909-01)
6504953	zfs_getpage() misunderstands VOP_GETPAGE() interface (141909-01)
6702206	ZFS read/writer lock contention throttles sendfile() benchmark (141445-01)
6780491	Zone on a ZFS filesystem has poor fork/exec performance (141445-01)
6747596	assertion failed: DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), BP_IDENTITY(zio->io_bp))); (141445-01)

8692:692d4668b40d
6801507	ZFS read aggregation should not mind the gap (141445-01)

8697:e62d2612c14d
6633095	creating a filesystem with many properties set is slow (141445-01)

8768:dfecfdbb27ed
6775697	oracle crashes when overwriting after hitting quota on zfs (141909-01)

8811:f8deccf701cf
6790687	libzfs mnttab caching ignores external changes (141445-01)
6791101	memory leak from libzfs_mnttab_init (141445-01)

8845:91af0d9c0790
6800942	smb_session_create() incorrectly stores IP addresses (N/A)
6582163	Access Control List (ACL) for shares (141445-01)
6804954	smb_search - shortname field should be space padded following the NULL terminator (N/A)
6800184	Panic at smb_oplock_conflict+0x35() (N/A)

8876:59d2e67b4b65
6803822	Reboot after replacement of system disk in a ZFS mirror drops to grub> prompt (141445-01)

8924:5af812f84759
6789318	coredump when issue zdb -uuuu poolname/ (141445-01)
6790345 zdb -dddd -e poolname coredump (141445-01)
6797109 zdb: 'zdb -dddddd pool_name/fs_name inode' coredump if the file with inode was deleted (141445-01)
6797118 zdb: 'zdb -dddddd poolname inum' coredump if I miss the fs name (141445-01)
6803343 shareiscsi=on failed, iscsitgtd failed request to share (141445-01)

9030:243fd360d81f
6815893	hang mounting a dataset after booting into a new boot environment (141445-01)

9056:826e1858a846
6809691	'zpool create -f' no longer overwrites ufs infomation (141445-01)

9179:d8fbd96b79b3
6790064	zfs needs to determine uid and gid earlier in create process (141445-01)

9214:8d350e5d04aa
6604992	forced unmount + being in .zfs/snapshot/<snap1> = not happy (141909-01)
6810367	assertion failed: dvp->v_flag & VROOT, file: ../../common/fs/gfs.c, line: 426 (141909-01)

9229:e3f8b41e5db4
6807765	ztest_dsl_dataset_promote_busy needs to clean up after ENOSPC (141445-01)

9230:e4561e3eb1ef
6821169	offlining a device results in checksum errors (141445-01)
6821170	ZFS should not increment error stats for unavailable devices (141445-01)
6824006	need to increase issue and interrupt taskqs threads in zfs (141445-01)

9234:bffdc4fc05c4
6792139	recovering from a suspended pool needs some work (141445-01)
6794830	reboot command hangs on a failed zfs pool (141445-01)

9246:67c03c93c071
6824062	System panicked in zfs_mount due to NULL pointer dereference when running btts and svvs tests (141909-01)

9276:a8a7fc849933
6816124	System crash running zpool destroy on broken zpool (141445-03)

9355:09928982c591
6818183	zfs snapshot -r is slow due to set_snap_props() doing txg_wait_synced() for each new snapshot (141445-03)

9391:413d0661ef33
6710376	log device can show incorrect status when other parts of pool are degraded (141445-03)

9396:f41cf682d0d3 (part already merged)
6501037	want user/group quotas on ZFS (141445-03)
6827260	assertion failed in arc_read(): hdr == pbuf->b_hdr (141445-03)
6815592	panic: No such hold X on refcount Y from zfs_znode_move (141445-03)
6759986	zfs list shows temporary %clone when doing online zfs recv (141445-03)

9404:319573cd93f8
6774713	zfs ignores canmount=noauto when sharenfs property != off (141445-03)

9412:4aefd8704ce0
6717022	ZFS DMU needs zero-copy support (141445-03)

9425:e7ffacaec3a8
6799895	spa_add_spares() needs to be protected by config lock (141445-03)
6826466	want to post sysevents on hot spare activation (141445-03)
6826468	spa 'allowfaulted' needs some work (141445-03)
6826469	kernel support for storing vdev FRU information (141445-03)
6826470	skip posting checksum errors from DTL regions of leaf vdevs (141445-03)
6826471	I/O errors after device remove probe can confuse FMA (141445-03)
6826472	spares should enjoy some of the benefits of cache devices (141445-03)

9443:2a96d8478e95
6833711	gang leaders shouldn't have to be logical (141445-03)

9463:d0bd231c7518
6764124	want zdb to be able to checksum metadata blocks only (141445-03)

9465:8372081b8019
6830237	zfs panic in zfs_groupmember() (141445-03)

9466:1fdfd1fed9c4
6833162	phantom log device in zpool status (141445-03)

9469:4f68f041ddcd
6824968	add ZFS userquota support to rquotad (141445-03)

9470:6d827468d7b5
6834217	godfather I/O should reexecute (141445-03)

9480:fcff33da767f
6596237	Stop looking and start ganging (141909-02)

9493:9933d599bc93
6623978	lwb->lwb_buf != NULL, file ../../../uts/common/fs/zfs/zil.c, line 787, function zil_lwb_commit (141445-06)

9512:64cafcbcc337
6801810	Commit of aligned streaming rewrites to ZIL device causes unwanted disk reads (N/A)

9515:d3b739d9d043
6586537	async zio taskqs can block out userland commands (142901-09)

9554:787363635b6a
6836768	zfs_userspace() callback has no way to indicate failure (N/A)

9574:1eb6a6ab2c57
6838062	zfs panics when an error is encountered in space_map_load() (141909-02)

9583:b0696cd037cc
6794136	Panic BAD TRAP: type=e when importing degraded zraid pool. (141909-03)

9630:e25a03f552e0
6776104	"zfs import" deadlock between spa_unload() and spa_async_thread() (141445-06)

9653:a70048a304d1
6664765	Unable to remove files when using fat-zap and quota exceeded on ZFS filesystem (141445-06)

9688:127be1845343
6841321	zfs userspace / zfs get userused@ doesn't work on mounted snapshot (N/A)
6843069	zfs get userused@S-1-... doesn't work (N/A)

9873:8ddc892eca6e
6847229	assertion failed: refcount_count(&tx->tx_space_written) + delta <= tx->tx_space_towrite in dmu_tx.c (141445-06)

9904:d260bd3fd47c
6838344	kernel heap corruption detected on zil while stress testing (141445-06)

9951:a4895b3dd543
6844900	zfs_ioc_userspace_upgrade leaks (N/A)

10040:38b25aeeaf7a
6857012	zfs panics on zpool import (141445-06)

10000:241a51d8720c
6848242	zdb -e no longer works as expected (N/A)

10100:4a6965f6bef8
6856634	snv_117 not booting: zfs_parse_bootfs: error2 (141445-07)

10160:a45b03783d44
6861983	zfs should use new name <-> SID interfaces (N/A)
6862984	userquota commands can hang (141445-06)

10299:80845694147f
6696858	zfs receive of incremental replication stream can dereference NULL pointer and crash (N/A)

10302:a9e3d1987706
6696858	zfs receive of incremental replication stream can dereference NULL pointer and crash (fix lint) (N/A)

10575:2a8816c5173b (partial merge)
6882227 spa_async_remove() shouldn't do a full clear (142901-14)

10800:469478b180d9
6880764	fsync on zfs is broken if writes are greater than 32kb on a hard crash and no log attached (142901-09)
6793430 zdb -ivvvv assertion failure: bp->blk_cksum.zc_word[2] == dmu_objset_id(zilog->zl_os) (N/A)

10801:e0bf032e8673 (partial merge)
6822816 assertion failed: zap_remove_int(ds_next_clones_obj) returns ENOENT (142901-09)

10810:b6b161a6ae4a
6892298 buf->b_hdr->b_state != arc_anon, file: ../../common/fs/zfs/arc.c, line: 2849 (142901-09)

10890:499786962772
6807339	spurious checksum errors when replacing a vdev (142901-13)

11249:6c30f7dfc97b
6906110 bad trap panic in zil_replay_log_record (142901-13)
6906946 zfs replay isn't handling uid/gid correctly (142901-13)

11454:6e69bacc1a5a
6898245 suspended zpool should not cause rest of the zfs/zpool commands to hang (142901-10)

11546:42ea6be8961b (partial merge)
6833999 3-way deadlock in dsl_dataset_hold_ref() and dsl_sync_task_group_sync() (142901-09)

Discussed with:	pjd
Approved by:	delphij (mentor)
Obtained from:	OpenSolaris (multiple Bug IDs)
MFC after:	2 months
2010-07-12 23:49:04 +00:00

1193 lines
29 KiB
C

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* This file contains the top half of the zfs directory structure
* implementation. The bottom half is in zap_leaf.c.
*
* The zdir is an extendable hash data structure. There is a table of
* pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
* each a constant size and hold a variable number of directory entries.
* The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
*
* The pointer table holds a power-of-2 number of pointers
* (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
* by the pointer at index i in the table holds entries whose hash value
* has a zd_prefix_len-bit prefix equal to i.
*/
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#include <sys/zap.h>
#include <sys/refcount.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/zfs_znode.h>
/* Default block shift; 1 << 14 gives the 16k block size noted below. */
int fzap_default_block_shift = 14; /* 16k blocksize */
/* Forward declarations for static helpers that are defined later in this file. */
static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
/*
 * Byteswap one block of a fat zap object.
 *
 * The first 64-bit word of the buffer is the block type.  It is compared
 * against ZBT_LEAF in both native and swapped byte order: leaf blocks are
 * handed to zap_leaf_byteswap(), and every other block is swapped as a
 * plain array of 64-bit words.
 */
void
fzap_byteswap(void *vbuf, size_t size)
{
	uint64_t type = *(uint64_t *)vbuf;
	int is_leaf = (type == ZBT_LEAF || type == BSWAP_64(ZBT_LEAF));

	if (!is_leaf) {
		/* it's a ptrtbl block */
		byteswap_uint64_array(vbuf, size);
		return;
	}

	zap_leaf_byteswap(vbuf, size);
}
/*
* Turn the zap into a (non-micro) fat zap: rewrite the header block held
* in zap_dbuf and initialize block 1 as the first leaf.
*
* The caller must hold zap_rwlock as writer (asserted below).
*/
void
fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
{
dmu_buf_t *db;
zap_leaf_t *l;
int i;
zap_phys_t *zp;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
zap->zap_ismicro = FALSE;
/* update the dbuf user registration, now passing &zap_f.zap_phys */
(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
&zap->zap_f.zap_phys, zap_evict);
mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0);
/* block shift is derived from the size of the header dbuf */
zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
zp = zap->zap_f.zap_phys;
/*
* explicitly zero it since it might be coming from an
* initialized microzap
*/
bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
zp->zap_block_type = ZBT_HEADER;
zp->zap_magic = ZAP_MAGIC;
zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
zp->zap_freeblk = 2; /* block 1 will be the first leaf */
zp->zap_num_leafs = 1;
zp->zap_num_entries = 0;
zp->zap_salt = zap->zap_salt;
zp->zap_normflags = zap->zap_normflags;
/* block 1 will be the first leaf */
for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
/*
* set up block 1 - the first leaf
*/
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
dmu_buf_will_dirty(db, tx);
/*
* A temporary zap_leaf_t is used only so zap_leaf_init() can be run
* on the buffer; it is freed again immediately afterwards.
*/
l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
l->l_dbuf = db;
l->l_phys = db->db_data;
zap_leaf_init(l, zp->zap_normflags != 0);
kmem_free(l, sizeof (zap_leaf_t));
dmu_buf_rele(db, FTAG);
}
/*
 * Make sure zap_rwlock is held as writer.
 *
 * Returns 1 if the lock is (now) write-held and 0 if the upgrade attempt
 * failed.  The header dbuf is dirtied only when the lock was actually
 * upgraded by this call.
 */
static int
zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
{
	if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
		if (!rw_tryupgrade(&zap->zap_rwlock))
			return (0);
		dmu_buf_will_dirty(zap->zap_dbuf, tx);
	}
	return (1);
}
/*
* Generic routines for dealing with the pointer & cookie tables.
*/
/*
* Perform one step of doubling the table described by tbl.
*
* Each call copies one block of the old table (block number
* zt_blks_copied) into two consecutive blocks of the new table using
* transfer_func.  On the first step (zt_nextblk == 0) twice as many
* blocks as the old table has are allocated for the new one.  Once every
* old block has been copied, the old range is freed and tbl is switched
* over to the new, twice-as-large table.
*
* Returns 0 on success, or the error from holding the old block.
* The caller must hold zap_rwlock as writer (asserted below).
*/
static int
zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
dmu_tx_t *tx)
{
uint64_t b, newblk;
dmu_buf_t *db_old, *db_new;
int err;
int bs = FZAP_BLOCK_SHIFT(zap);
int hepb = 1<<(bs-4);
/* hepb = half the number of entries in a block */
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
ASSERT(tbl->zt_blk != 0);
ASSERT(tbl->zt_numblks > 0);
if (tbl->zt_nextblk != 0) {
/* a grow is already in progress; keep using its new location */
newblk = tbl->zt_nextblk;
} else {
/* first step: allocate the new table and prefetch the old one */
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk;
ASSERT3U(tbl->zt_blks_copied, ==, 0);
dmu_prefetch(zap->zap_objset, zap->zap_object,
tbl->zt_blk << bs, tbl->zt_numblks << bs);
}
/*
* Copy the ptrtbl from the old to new location.
*/
b = tbl->zt_blks_copied;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(tbl->zt_blk + b) << bs, FTAG, &db_old);
if (err)
return (err);
/* first half of entries in old[b] go to new[2*b+0] */
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
(newblk + 2*b+0) << bs, FTAG, &db_new));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
dmu_buf_rele(db_new, FTAG);
/* second half of entries in old[b] go to new[2*b+1] */
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
(newblk + 2*b+1) << bs, FTAG, &db_new));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
db_new->db_data, hepb);
dmu_buf_rele(db_new, FTAG);
dmu_buf_rele(db_old, FTAG);
tbl->zt_blks_copied++;
dprintf("copied block %llu of %llu\n",
tbl->zt_blks_copied, tbl->zt_numblks);
if (tbl->zt_blks_copied == tbl->zt_numblks) {
/* all blocks copied: free the old table and switch to the new one */
(void) dmu_free_range(zap->zap_objset, zap->zap_object,
tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
tbl->zt_blk = newblk;
tbl->zt_numblks *= 2;
tbl->zt_shift++;
tbl->zt_nextblk = 0;
tbl->zt_blks_copied = 0;
dprintf("finished; numblocks now %llu (%lluk entries)\n",
tbl->zt_numblks, 1<<(tbl->zt_shift-10));
}
return (0);
}
/*
* Store val at entry idx of the table described by tbl.
*
* If a grow is in progress (zt_nextblk != 0), val is also written to
* entries 2*idx and 2*idx+1 of the new table so both copies stay in sync.
*
* Returns 0 on success, or the error from holding either block; nothing
* is written if either hold fails.
*/
static int
zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
dmu_tx_t *tx)
{
int err;
uint64_t blk, off;
int bs = FZAP_BLOCK_SHIFT(zap);
dmu_buf_t *db;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT(tbl->zt_blk != 0);
dprintf("storing %llx at index %llx\n", val, idx);
/* entries are 8 bytes each, so a block holds 1<<(bs-3) of them */
blk = idx >> (bs-3);
off = idx & ((1<<(bs-3))-1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(tbl->zt_blk + blk) << bs, FTAG, &db);
if (err)
return (err);
dmu_buf_will_dirty(db, tx);
if (tbl->zt_nextblk != 0) {
/* mirror the store into the doubled table that is being built */
uint64_t idx2 = idx * 2;
uint64_t blk2 = idx2 >> (bs-3);
uint64_t off2 = idx2 & ((1<<(bs-3))-1);
dmu_buf_t *db2;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
if (err) {
dmu_buf_rele(db, FTAG);
return (err);
}
dmu_buf_will_dirty(db2, tx);
((uint64_t *)db2->db_data)[off2] = val;
((uint64_t *)db2->db_data)[off2+1] = val;
dmu_buf_rele(db2, FTAG);
}
((uint64_t *)db->db_data)[off] = val;
dmu_buf_rele(db, FTAG);
return (0);
}
/*
 * Load entry idx of the table described by tbl into *valp.
 *
 * If a grow is in progress (zt_nextblk != 0), the corresponding block of
 * the new table is also read, purely so that i/o errors are reported here
 * rather than later in zap_table_store().
 *
 * Returns 0 on success or the error from dmu_buf_hold().  Note that *valp
 * has already been filled in if only the second hold fails.
 */
static int
zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
{
	uint64_t blk, off;
	int err;
	dmu_buf_t *db;
	int bs = FZAP_BLOCK_SHIFT(zap);

	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	/* entries are 8 bytes each, so a block holds 1<<(bs-3) of them */
	blk = idx >> (bs-3);
	off = idx & ((1<<(bs-3))-1);

	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
	    (tbl->zt_blk + blk) << bs, FTAG, &db);
	if (err)
		return (err);
	*valp = ((uint64_t *)db->db_data)[off];
	dmu_buf_rele(db, FTAG);

	if (tbl->zt_nextblk != 0) {
		/*
		 * read the nextblk for the sake of i/o error checking,
		 * so that zap_table_load() will catch errors for
		 * zap_table_store.
		 */
		blk = (idx*2) >> (bs-3);

		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
		    (tbl->zt_nextblk + blk) << bs, FTAG, &db);
		/*
		 * Only release the buffer if the hold succeeded; on
		 * failure db still refers to the buffer released above.
		 */
		if (err == 0)
			dmu_buf_rele(db, FTAG);
	}
	return (err);
}
/*
* Routines for growing the ptrtbl.
*/
/*
 * Expand n pointer-table entries from src into 2*n entries in dst:
 * every source entry is written to two consecutive destination slots.
 */
static void
zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
{
	uint64_t *out = dst;
	int k;

	for (k = 0; k < n; k++) {
		uint64_t ptr = src[k];

		*out++ = ptr;
		*out++ = ptr;
	}
}
/*
* Grow the zap's pointer table.
*
* If the table is still the one embedded in the header block
* (zt_numblks == 0), it is moved into a newly allocated block of its own,
* doubling it in the process.  Otherwise one step of zap_table_grow() is
* performed.
*
* Returns 0 on success, ENOSPC if zt_shift has reached ZAP_HASHBITS-2,
* or an error from dmu_buf_hold() / zap_table_grow().
*/
static int
zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
{
/* In case things go horribly wrong. */
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
return (ENOSPC);
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
/*
* We are outgrowing the "embedded" ptrtbl (the one
* stored in the header block). Give it its own entire
* block, which will double the size of the ptrtbl.
*/
uint64_t newblk;
dmu_buf_t *db_new;
int err;
ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
newblk = zap_allocate_blocks(zap, 1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
if (err)
return (err);
dmu_buf_will_dirty(db_new, tx);
/* duplicate every embedded entry into the new block */
zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
dmu_buf_rele(db_new, FTAG);
zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
/* the doubled table must exactly fill the one new block */
ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
(FZAP_BLOCK_SHIFT(zap)-3));
return (0);
} else {
return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
zap_ptrtbl_transfer, tx));
}
}
/*
* Add delta (which may be negative) to the on-disk entry count.
* The header dbuf is dirtied first, and the update itself is done
* under zap_num_entries_mtx.
*/
static void
zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
{
dmu_buf_will_dirty(zap->zap_dbuf, tx);
mutex_enter(&zap->zap_f.zap_num_entries_mtx);
/* a negative delta must not take the count below zero */
ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
zap->zap_f.zap_phys->zap_num_entries += delta;
mutex_exit(&zap->zap_f.zap_num_entries_mtx);
}
/*
 * Reserve nblocks consecutive block numbers by advancing zap_freeblk.
 * Returns the first block number of the reserved range.
 * The caller must hold zap_rwlock as writer (asserted below).
 */
static uint64_t
zap_allocate_blocks(zap_t *zap, int nblocks)
{
	zap_phys_t *zp = zap->zap_f.zap_phys;
	uint64_t first;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	first = zp->zap_freeblk;
	zp->zap_freeblk = first + nblocks;
	return (first);
}
/*
* Allocate a new block for a leaf and set up its in-core zap_leaf_t.
*
* The leaf is returned with l_rwlock held as writer and its dbuf held
* and dirtied; zap_num_leafs is incremented.
* The caller must hold zap_rwlock as writer (asserted below).
*/
static zap_leaf_t *
zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
{
void *winner;
zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = zap_allocate_blocks(zap, 1);
l->l_dbuf = NULL;
l->l_phys = NULL;
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
/* the block is newly allocated, so no other user is expected on the dbuf */
winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
ASSERT(winner == NULL);
dmu_buf_will_dirty(l->l_dbuf, tx);
zap_leaf_init(l, zap->zap_normflags != 0);
zap->zap_f.zap_phys->zap_num_leafs++;
return (l);
}
/*
* Store the fat zap's entry count in *count.  Always returns 0.
* The zap must not be a microzap (asserted below).
*/
int
fzap_count(zap_t *zap, uint64_t *count)
{
ASSERT(!zap->zap_ismicro);
mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
*count = zap->zap_f.zap_phys->zap_num_entries;
mutex_exit(&zap->zap_f.zap_num_entries_mtx);
return (0);
}
/*
* Routines for obtaining zap_leaf_t's
*/
/*
* Release a leaf: drop its l_rwlock and then release the hold on its dbuf.
*/
void
zap_put_leaf(zap_leaf_t *l)
{
rw_exit(&l->l_rwlock);
dmu_buf_rele(l->l_dbuf, NULL);
}
/*
* Destroy an in-core zap_leaf_t: tear down its rwlock and free it.
* Registered as the dbuf user callback via dmu_buf_set_user(); db is
* unused (zap_open_leaf() also calls this directly with db == NULL).
*/
_NOTE(ARGSUSED(0))
static void
zap_leaf_pageout(dmu_buf_t *db, void *vl)
{
zap_leaf_t *l = vl;
rw_destroy(&l->l_rwlock);
kmem_free(l, sizeof (zap_leaf_t));
}
/*
* Create the in-core zap_leaf_t for block blkid and attach it to db as
* the dbuf user.  If another thread attached one first, ours is destroyed
* and theirs is returned.  The returned leaf is not locked.
*/
static zap_leaf_t *
zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
{
zap_leaf_t *l, *winner;
ASSERT(blkid != 0);
l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
/* l_rwlock is held as writer while the leaf is being filled in */
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = blkid;
l->l_bs = highbit(db->db_size)-1;
l->l_dbuf = db;
l->l_phys = NULL;
winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
rw_exit(&l->l_rwlock);
if (winner != NULL) {
/* someone else set it first */
zap_leaf_pageout(NULL, l);
l = winner;
}
/*
* lh_pad1 was previously used for the next leaf in the leaf
* chain. There should be no chained leafs (as we have removed
* support for them).
*/
ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0);
/*
* There should be more hash entries than there can be
* chunks to put in the hash table
*/
ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
/* The chunks should begin at the end of the hash table */
ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
&l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
/* The chunks should end at the end of the block */
ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
(uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
return (l);
}
/*
* Get the leaf stored in block blkid, locked with lock type lt.
*
* The block's dbuf is held, and the in-core zap_leaf_t is taken from the
* dbuf user data or created with zap_open_leaf().  For RW_WRITER the dbuf
* is also dirtied in tx.  On success *lp is set and 0 is returned; the
* lock and hold are both dropped by zap_put_leaf().  On failure the error
* from dmu_buf_hold() is returned.
*/
static int
zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
zap_leaf_t **lp)
{
dmu_buf_t *db;
zap_leaf_t *l;
int bs = FZAP_BLOCK_SHIFT(zap);
int err;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
blkid << bs, NULL, &db);
if (err)
return (err);
ASSERT3U(db->db_object, ==, zap->zap_object);
ASSERT3U(db->db_offset, ==, blkid << bs);
ASSERT3U(db->db_size, ==, 1 << bs);
ASSERT(blkid != 0);
/* reuse the leaf already attached to the dbuf, if there is one */
l = dmu_buf_get_user(db);
if (l == NULL)
l = zap_open_leaf(blkid, db);
rw_enter(&l->l_rwlock, lt);
/*
* Must lock before dirtying, otherwise l->l_phys could change,
* causing ASSERT below to fail.
*/
if (lt == RW_WRITER)
dmu_buf_will_dirty(db, tx);
ASSERT3U(l->l_blkid, ==, blkid);
ASSERT3P(l->l_dbuf, ==, db);
ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
*lp = l;
return (0);
}
/*
 * Translate pointer-table index idx into a leaf block number (*valp).
 *
 * While the pointer table is still embedded in the header block
 * (zt_numblks == 0) the entry is read directly; otherwise it is fetched
 * with zap_table_load(), whose error (if any) is returned.
 */
static int
zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
{
	zap_table_phys_t *ptrtbl = &zap->zap_f.zap_phys->zap_ptrtbl;

	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	if (ptrtbl->zt_numblks != 0)
		return (zap_table_load(zap, ptrtbl, idx, valp));

	ASSERT3U(idx, <, (1ULL << ptrtbl->zt_shift));
	*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
	return (0);
}
/*
 * Point pointer-table entry idx at leaf block blk.
 *
 * While the pointer table is still embedded in the header block
 * (zt_blk == 0) the entry is written directly; otherwise the store goes
 * through zap_table_store(), whose error (if any) is returned.
 * The caller must hold zap_rwlock as writer and pass a non-NULL tx.
 */
static int
zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
{
	zap_table_phys_t *ptrtbl = &zap->zap_f.zap_phys->zap_ptrtbl;

	ASSERT(tx != NULL);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	if (ptrtbl->zt_blk != 0)
		return (zap_table_store(zap, ptrtbl, idx, blk, tx));

	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
	return (0);
}
/*
* Find and lock the leaf responsible for hash value h.
*
* The hash is turned into a pointer-table index using the current
* zt_shift, the index is mapped to a block number, and that leaf is
* obtained with lock type lt.  Returns 0 with *lp set, or an error from
* zap_idx_to_blk() / zap_get_leaf_byblk().
*/
static int
zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{
uint64_t idx, blk;
int err;
ASSERT(zap->zap_dbuf == NULL ||
zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
err = zap_idx_to_blk(zap, idx, &blk);
if (err != 0)
return (err);
err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
/* the leaf's own prefix must match the hash we looked up */
ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
(*lp)->l_phys->l_hdr.lh_prefix);
return (err);
}
/*
* Split leaf l (which covers zn->zn_hash) into two leaves and return in
* *lp whichever of the two now covers the hash.
*
* This needs zap_rwlock as writer.  If the lock cannot be upgraded, or
* the pointer table must grow first, the leaf and the zap lock are
* dropped and the zap is re-locked as writer; zn->zn_zap is updated by
* that re-lock.  If the leaf was split by someone else in the meantime,
* the re-fetched leaf is returned without splitting again.
*
* Returns 0 on success or an error from zap_lockdir(),
* zap_grow_ptrtbl(), zap_deref_leaf() or zap_idx_to_blk().
*
* NOTE(review): on the error returns inside the re-lock branch, l has
* already been released with zap_put_leaf() and *lp is left unchanged,
* yet fzap_add_cd() and fzap_update() still pass their leaf pointer to
* zap_put_leaf_maybe_grow_ptrtbl() when zn->zn_zap is non-NULL - confirm
* this cannot release the leaf twice.
*/
static int
zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
{
zap_t *zap = zn->zn_zap;
uint64_t hash = zn->zn_hash;
zap_leaf_t *nl;
int prefix_diff, i, err;
uint64_t sibling;
int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
l->l_phys->l_hdr.lh_prefix);
if (zap_tryupgradedir(zap, tx) == 0 ||
old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
/* We failed to upgrade, or need to grow the pointer table */
objset_t *os = zap->zap_objset;
uint64_t object = zap->zap_object;
/* drop everything, then re-take the zap lock as writer */
zap_put_leaf(l);
zap_unlockdir(zap);
err = zap_lockdir(os, object, tx, RW_WRITER,
FALSE, FALSE, &zn->zn_zap);
zap = zn->zn_zap;
if (err)
return (err);
ASSERT(!zap->zap_ismicro);
/* grow until the table has more prefix bits than the leaf */
while (old_prefix_len ==
zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
err = zap_grow_ptrtbl(zap, tx);
if (err)
return (err);
}
err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
if (err)
return (err);
if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
/* it split while our locks were down */
*lp = l;
return (0);
}
}
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
l->l_phys->l_hdr.lh_prefix);
/*
* The new sibling takes the range of pointer-table entries whose
* (old_prefix_len + 1)-bit prefix ends in a 1 bit.
*/
prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
(old_prefix_len + 1);
sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
/* check for i/o errors before doing zap_leaf_split */
for (i = 0; i < (1ULL<<prefix_diff); i++) {
uint64_t blk;
err = zap_idx_to_blk(zap, sibling+i, &blk);
if (err)
return (err);
ASSERT3U(blk, ==, l->l_blkid);
}
nl = zap_create_leaf(zap, tx);
zap_leaf_split(l, nl, zap->zap_normflags != 0);
/* set sibling pointers */
for (i = 0; i < (1ULL<<prefix_diff); i++) {
err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
}
/* keep whichever leaf covers the hash and release the other one */
if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
/* we want the sibling */
zap_put_leaf(l);
*lp = nl;
} else {
zap_put_leaf(nl);
*lp = l;
}
return (0);
}
/*
* Release leaf l and, if warranted, do one step of pointer-table growth.
*
* Growth is attempted when the leaf's prefix length equals the table
* shift and its free count is below ZAP_LEAF_LOW_WATER, or when a table
* grow is already in progress (zt_nextblk != 0).  This may drop and
* re-take the zap lock as writer, updating zn->zn_zap.  Errors from
* re-locking or growing are not reported to the caller.
*/
static void
zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
{
zap_t *zap = zn->zn_zap;
int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
/* evaluate the leaf's state before it is released below */
int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
zap_put_leaf(l);
if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
int err;
/*
* We are in the middle of growing the pointer table, or
* this leaf will soon make us grow it.
*/
if (zap_tryupgradedir(zap, tx) == 0) {
objset_t *os = zap->zap_objset;
uint64_t zapobj = zap->zap_object;
zap_unlockdir(zap);
err = zap_lockdir(os, zapobj, tx,
RW_WRITER, FALSE, FALSE, &zn->zn_zap);
zap = zn->zn_zap;
if (err)
return;
}
/* could have finished growing while our locks were down */
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
(void) zap_grow_ptrtbl(zap, tx);
}
}
/*
 * Validate an attribute's name length and value geometry.
 *
 * name may be NULL, in which case its length is not checked.
 *
 * Returns 0 if acceptable, E2BIG if the name is longer than
 * ZAP_MAXNAMELEN or the value (integer_size * num_integers bytes) is
 * larger than ZAP_MAXVALUELEN, and EINVAL if integer_size is not
 * 1, 2, 4 or 8.
 */
static int
fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
{
	if (name && strlen(name) > ZAP_MAXNAMELEN)
		return (E2BIG);

	/* Only integer sizes supported by C */
	switch (integer_size) {
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Compare by division rather than multiplication: the product
	 * integer_size * num_integers can wrap around in 64 bits and
	 * would then slip past the limit.  integer_size is known to be
	 * non-zero here.
	 */
	if (num_integers > ZAP_MAXVALUELEN / integer_size)
		return (E2BIG);

	return (0);
}
/*
* Routines for manipulating attributes.
*/
/*
* Look up the entry named by zn and copy its value into buf.
*
* When the entry is found, its stored name is also copied into realname
* (rn_len is passed as the buffer length) and, if ncp is non-NULL, *ncp
* receives the result of zap_entry_normalization_conflict().
*
* Returns 0 on success, or an error from fzap_checksize(),
* zap_deref_leaf(), zap_leaf_lookup() or zap_entry_read().
*/
int
fzap_lookup(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *ncp)
{
zap_leaf_t *l;
int err;
zap_entry_handle_t zeh;
err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
if (err != 0)
return (err);
/* read-only lookup: no tx, leaf locked as reader */
err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
if (err != 0)
return (err);
err = zap_leaf_lookup(l, zn, &zeh);
if (err == 0) {
err = zap_entry_read(&zeh, integer_size, num_integers, buf);
/* the result of reading the name is deliberately ignored */
(void) zap_entry_read_name(&zeh, rn_len, realname);
if (ncp) {
*ncp = zap_entry_normalization_conflict(&zeh,
zn, NULL, zn->zn_zap);
}
}
zap_put_leaf(l);
return (err);
}
/*
* Add a new entry for zn with the given value; cd is passed through to
* zap_entry_create().
*
* Returns EEXIST if an entry for zn already exists.  If the leaf has no
* room (zap_entry_create() returns EAGAIN), the leaf is expanded and the
* add is retried.  The size arguments are only checked by ASSERT here;
* fzap_add() validates them before calling.
*
* zap_expand_leaf() may replace zn->zn_zap, so the local zap pointer is
* refreshed after every call to it.
*/
int
fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err;
zap_entry_handle_t zeh;
zap_t *zap = zn->zn_zap;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT(!zap->zap_ismicro);
ASSERT(fzap_checksize(zn->zn_name_orij,
integer_size, num_integers) == 0);
err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
if (err != 0)
return (err);
retry:
err = zap_leaf_lookup(l, zn, &zeh);
if (err == 0) {
/* the name is already present */
err = EEXIST;
goto out;
}
if (err != ENOENT)
goto out;
err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd,
integer_size, num_integers, val, &zeh);
if (err == 0) {
zap_increment_num_entries(zap, 1, tx);
} else if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
}
out:
/* skip the release when zap_expand_leaf() left zn->zn_zap NULL */
if (zap != NULL)
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
return (err);
}
/*
 * Add a new entry for zn after validating the name and value sizes.
 * The actual work is done by fzap_add_cd() with cd == ZAP_MAXCD.
 * Returns 0 on success or the error from fzap_checksize() /
 * fzap_add_cd().
 */
int
fzap_add(zap_name_t *zn,
    uint64_t integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	int err;

	err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
	if (err == 0) {
		err = fzap_add_cd(zn, integer_size, num_integers,
		    val, ZAP_MAXCD, tx);
	}
	return (err);
}
/*
* Set the value of the entry named by zn, creating the entry if it does
* not exist yet.
*
* If the leaf has no room (EAGAIN from create or update), the leaf is
* expanded and the operation is retried.  zap_expand_leaf() may replace
* zn->zn_zap, so the local zap pointer is refreshed after every call.
*
* Returns 0 on success or an error from fzap_checksize(),
* zap_deref_leaf(), zap_entry_create(), zap_entry_update() or
* zap_expand_leaf().
*/
int
fzap_update(zap_name_t *zn,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err, create;
zap_entry_handle_t zeh;
zap_t *zap = zn->zn_zap;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
if (err != 0)
return (err);
err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
if (err != 0)
return (err);
retry:
err = zap_leaf_lookup(l, zn, &zeh);
/* ENOENT means the entry has to be created rather than updated */
create = (err == ENOENT);
ASSERT(err == 0 || err == ENOENT);
if (create) {
err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash,
ZAP_MAXCD, integer_size, num_integers, val, &zeh);
if (err == 0)
zap_increment_num_entries(zap, 1, tx);
} else {
err = zap_entry_update(&zeh, integer_size, num_integers, val);
}
if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
}
/* skip the release when zap_expand_leaf() left zn->zn_zap NULL */
if (zap != NULL)
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
return (err);
}
/*
 * Report the integer size and integer count of the entry named by zn.
 * Either output pointer may be NULL if the caller is not interested.
 * Returns 0 on success or the error from zap_deref_leaf() /
 * zap_leaf_lookup().
 */
int
fzap_length(zap_name_t *zn,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_entry_handle_t zeh;
	zap_leaf_t *leaf;
	int rc;

	rc = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &leaf);
	if (rc != 0)
		return (rc);

	rc = zap_leaf_lookup(leaf, zn, &zeh);
	if (rc == 0) {
		if (integer_size)
			*integer_size = zeh.zeh_integer_size;
		if (num_integers)
			*num_integers = zeh.zeh_num_integers;
	}

	zap_put_leaf(leaf);
	return (rc);
}
/*
 * Remove the entry named by zn from a fat ZAP.
 *
 * Returns 0 if the entry was found and removed (the ZAP's entry count is
 * decremented by one), otherwise the error from zap_deref_leaf() or
 * zap_leaf_lookup().
 */
int
fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
{
	zap_entry_handle_t zeh;
	zap_leaf_t *leaf;
	int rc;

	rc = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &leaf);
	if (rc != 0)
		return (rc);

	rc = zap_leaf_lookup(leaf, zn, &zeh);
	if (rc != 0) {
		/* Nothing to remove; just release the leaf. */
		zap_put_leaf(leaf);
		return (rc);
	}

	zap_entry_remove(&zeh);
	zap_increment_num_entries(zn->zn_zap, -1, tx);
	zap_put_leaf(leaf);
	return (0);
}
/*
* Helper functions for consumers.
*/
/*
 * Walk the ZAP object zapobj looking for the first attribute whose first
 * integer matches value under mask (a mask of 0 means "compare all
 * bits").  On a match the attribute's name is copied into name and 0 is
 * returned; otherwise the non-zero result of zap_cursor_retrieve() that
 * ended the walk is returned.
 *
 * NOTE: the name is copied with strcpy(), so the caller's buffer must be
 * large enough to hold any attribute name.
 */
int
zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
    char *name)
{
	zap_cursor_t cursor;
	zap_attribute_t *attr;
	int rc;

	if (mask == 0)
		mask = -1ULL;

	/* Heap-allocate the attribute rather than placing it on the stack. */
	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	zap_cursor_init(&cursor, os, zapobj);
	while ((rc = zap_cursor_retrieve(&cursor, attr)) == 0) {
		if ((attr->za_first_integer & mask) == (value & mask)) {
			(void) strcpy(name, attr->za_name);
			break;
		}
		zap_cursor_advance(&cursor);
	}
	zap_cursor_fini(&cursor);

	kmem_free(attr, sizeof (zap_attribute_t));
	return (rc);
}
/*
 * Copy every attribute of the ZAP object fromobj into intoobj.
 *
 * Each source attribute must be a single 8-byte integer; the first one
 * that is not causes EINVAL to be returned.  A failure from zap_add()
 * is likewise returned immediately.  Attributes already copied before a
 * failure are left in intoobj.  Returns 0 once the cursor stops
 * yielding entries.
 *
 * The cursor is finalised on every exit path, including the error
 * paths, so that nothing it holds is leaked.
 */
int
zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	int err = 0;

	for (zap_cursor_init(&zc, os, fromobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    (void) zap_cursor_advance(&zc)) {
		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
			err = EINVAL;
			break;
		}
		err = zap_add(os, intoobj, za.za_name,
		    8, 1, &za.za_first_integer, tx);
		if (err != 0)
			break;
	}
	/* Reached from both normal termination and the error breaks. */
	zap_cursor_fini(&zc);
	return (err);
}
/*
 * Add value to the ZAP object obj as a single 8-byte integer, keyed by
 * the lowercase hexadecimal rendering of value itself.  Returns the
 * result of zap_add().
 */
int
zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
{
	/* 16 hex digits plus the terminating NUL fit comfortably in 20. */
	char key[20];

	(void) snprintf(key, sizeof (key), "%llx", (longlong_t)value);

	return (zap_add(os, obj, key, 8, 1, &value, tx));
}
/*
 * Remove from the ZAP object obj the attribute whose name is the
 * lowercase hexadecimal rendering of value.  Returns the result of
 * zap_remove().
 */
int
zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
{
	/* 16 hex digits plus the terminating NUL fit comfortably in 20. */
	char key[20];

	(void) snprintf(key, sizeof (key), "%llx", (longlong_t)value);

	return (zap_remove(os, obj, key, tx));
}
/*
 * Look up, in the ZAP object obj, the attribute whose name is the
 * lowercase hexadecimal rendering of value.  Returns the result of
 * zap_lookup(); the looked-up integer is written into this function's
 * own copy of value and is not passed back to the caller.
 */
int
zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
{
	/* 16 hex digits plus the terminating NUL fit comfortably in 20. */
	char key[20];

	(void) snprintf(key, sizeof (key), "%llx", (longlong_t)value);

	return (zap_lookup(os, obj, key, 8, 1, &value));
}
/*
* Routines for iterating over the attributes.
*/
/*
 * Retrieve the entry at or after the cursor position (zc_hash/zc_cd)
 * into *za.
 *
 * On success (0), the cursor is moved onto the entry found and za's
 * integer length, integer count, first integer, name and
 * normalization-conflict flag are filled in.  If there is no such entry,
 * ENOENT is returned and zc_hash is left at -1ULL.  An error from
 * zap_deref_leaf() is returned as-is.
 *
 * The cursor keeps its current leaf in zc_leaf between calls; on return
 * that leaf's l_rwlock has been dropped.
 */
int
fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
{
	int err = ENOENT;
	zap_entry_handle_t zeh;
	zap_leaf_t *l;

	/* retrieve the next entry at or after zc_hash/zc_cd */
	/* if no entry, return ENOENT */

	/*
	 * If the cached leaf no longer covers the cursor's hash (its prefix
	 * does not match), discard it.  The leaf lock is taken first because
	 * every zap_put_leaf() call in this function is made with the lock
	 * held.
	 */
	if (zc->zc_leaf &&
	    (ZAP_HASH_IDX(zc->zc_hash,
	    zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
	    zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}

again:
	if (zc->zc_leaf == NULL) {
		err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
		    &zc->zc_leaf);
		if (err != 0)
			return (err);
	} else {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
	}
	l = zc->zc_leaf;

	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);

	if (err == ENOENT) {
		if (l->l_phys->l_hdr.lh_prefix_len == 0) {
			/*
			 * A zero-length prefix means this leaf covers the
			 * whole hash space, so there is nothing further to
			 * visit.  This case is handled first because the
			 * mask computation below would otherwise shift a
			 * 64-bit value by 64, which is undefined behavior.
			 */
			zc->zc_hash = -1ULL;
			zc->zc_cd = 0;
		} else {
			/*
			 * Advance the cursor to the first hash value past
			 * the range covered by this leaf's prefix.
			 */
			uint64_t nocare =
			    (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;

			zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
			zc->zc_cd = 0;
			if (zc->zc_hash == 0) {
				/* Wrapped around: the walk is complete. */
				zc->zc_hash = -1ULL;
			} else {
				zap_put_leaf(zc->zc_leaf);
				zc->zc_leaf = NULL;
				goto again;
			}
		}
	}

	if (err == 0) {
		zc->zc_hash = zeh.zeh_hash;
		zc->zc_cd = zeh.zeh_cd;
		za->za_integer_length = zeh.zeh_integer_size;
		za->za_num_integers = zeh.zeh_num_integers;
		if (zeh.zeh_num_integers == 0) {
			za->za_first_integer = 0;
		} else {
			/*
			 * EOVERFLOW is tolerated here; the result is
			 * overwritten by the name read below.
			 */
			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
			ASSERT(err == 0 || err == EOVERFLOW);
		}
		err = zap_entry_read_name(&zeh,
		    sizeof (za->za_name), za->za_name);
		ASSERT(err == 0);

		za->za_normalization_conflict =
		    zap_entry_normalization_conflict(&zeh,
		    NULL, za->za_name, zap);
	}
	/* Drop the leaf lock; the leaf stays cached in the cursor. */
	rw_exit(&zc->zc_leaf->l_rwlock);
	return (err);
}
/*
 * Accumulate leaf statistics into zs for every leaf referenced by the
 * len-entry pointer table tbl.  Leaves that cannot be fetched are
 * silently skipped.
 */
static void
zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
{
	uint64_t prevblk = 0;
	int idx;

	/*
	 * NB: if a leaf has more pointers than an entire ptrtbl block
	 * can hold, then it'll be accounted for more than once, since
	 * prevblk is not carried over from one call to the next.
	 */
	for (idx = 0; idx < len; idx++) {
		uint64_t blk = tbl[idx];
		zap_leaf_t *leaf;

		/* Runs of identical pointers name the same leaf. */
		if (blk != prevblk) {
			prevblk = blk;

			if (zap_get_leaf_byblk(zap, blk, NULL, RW_READER,
			    &leaf) == 0) {
				zap_leaf_stats(zap, leaf, zs);
				zap_put_leaf(leaf);
			}
		}
	}
}
/*
 * Fill in zs with statistics for a fat ZAP: header fields, pointer-table
 * fields, and per-leaf statistics gathered by walking the pointer table
 * (either the one embedded in the header block or the external one).
 * Pointer-table blocks that cannot be held are skipped.
 */
void
fzap_get_stats(zap_t *zap, zap_stats_t *zs)
{
	int bs = FZAP_BLOCK_SHIFT(zap);
	int b;

	zs->zs_blocksize = 1ULL << bs;

	/*
	 * Fields copied from the zap_phys_t header.
	 */
	zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
	zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
	zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
	zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
	zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
	zs->zs_salt = zap->zap_f.zap_phys->zap_salt;

	/*
	 * Fields copied from the pointer-table descriptor.
	 */
	zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
	zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
	zs->zs_ptrtbl_blks_copied =
	    zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
	zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
	zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
	zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;

	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
		/* the ptrtbl is entirely in the header block. */
		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
		return;
	}

	/* External pointer table: prefetch it, then walk it block by block. */
	dmu_prefetch(zap->zap_objset, zap->zap_object,
	    zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
	    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);

	for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; b++) {
		dmu_buf_t *db;

		if (dmu_buf_hold(zap->zap_objset, zap->zap_object,
		    (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
		    FTAG, &db) != 0)
			continue;

		/* Each block holds 2^(bs-3) eight-byte pointers. */
		zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs);
		dmu_buf_rele(db, FTAG);
	}
}
/*
 * Estimate how many bytes an operation on the entry named by zn will
 * write, adding the estimate to *towrite and/or *tooverwrite.  add is
 * non-zero when the operation adds an entry.
 *
 * Returns 0 on success, or the error from zap_deref_leaf(); in the error
 * case the header and pointer-table contributions have already been
 * added to the counters.
 */
int
fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
    uint64_t *tooverwrite)
{
	zap_t *zap = zn->zn_zap;
	zap_leaf_t *leaf;
	int rc;

	/*
	 * Header block of the fatzap: counted as an overwrite only when we
	 * are not adding and the buffer is freeable.
	 */
	if (add || !dmu_buf_freeable(zap->zap_dbuf)) {
		*towrite += zap->zap_dbuf->db_size;
	} else {
		*tooverwrite += zap->zap_dbuf->db_size;
	}

	/*
	 * Pointer table blocks.  When adding, allow for:
	 * - an embedded pointer table being forced out into an external
	 *   one (one extra block), or
	 * - an existing external pointer table being extended (three
	 *   extra blocks).
	 */
	if (add) {
		if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk != 0)
			*towrite += (zap->zap_dbuf->db_size * 3);
		else
			*towrite += zap->zap_dbuf->db_size;
	}

	/*
	 * Finally, the leaf block that holds (or would hold) the entry.
	 */
	rc = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &leaf);
	if (rc != 0)
		return (rc);

	if (add) {
		/*
		 * An add may split the leaf, so account for one
		 * additional leaf block.
		 */
		*towrite += 2 * leaf->l_dbuf->db_size;
	} else if (dmu_buf_freeable(leaf->l_dbuf)) {
		*tooverwrite += leaf->l_dbuf->db_size;
	} else {
		*towrite += 1 * leaf->l_dbuf->db_size;
	}

	zap_put_leaf(leaf);
	return (0);
}