freebsd-nq/module/zfs/zap_micro.c
Brian Behlendorf 3ec3bc2167 OpenZFS 7793 - ztest fails assertion in dmu_tx_willuse_space
Reviewed by: Steve Gonczi <steve.gonczi@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

Background information: This assertion about tx_space_* verifies that we
are not dirtying more stuff than we thought we would. We “need” to know
how much we will dirty so that we can check if we should fail this
transaction with ENOSPC/EDQUOT, in dmu_tx_assign(). While the
transaction is open (i.e. between dmu_tx_assign() and dmu_tx_commit() —
typically less than a millisecond), we call dbuf_dirty() on the exact
blocks that will be modified. Once this happens, the temporary
accounting in tx_space_* is unnecessary, because we know exactly what
blocks are newly dirtied; we call dnode_willuse_space() to track this
more exact accounting.

The fundamental problem causing this bug is that dmu_tx_hold_*() relies
on the current state in the DMU (e.g. dn_nlevels) to predict how much
will be dirtied by this transaction, but this state can change before we
actually perform the transaction (i.e. call dbuf_dirty()).

This bug will be fixed by removing the assertion that the tx_space_*
accounting is perfectly accurate (i.e. we never dirty more than was
predicted by dmu_tx_hold_*()). By removing the requirement that this
accounting be perfectly accurate, we can also vastly simplify it, e.g.
removing most of the logic in dmu_tx_count_*().

The new tx space accounting will be very approximate, and may be more or
less than what is actually dirtied. It will still be used to determine
if this transaction will put us over quota. Transactions that are marked
by dmu_tx_mark_netfree() will be excepted from this check. We won’t make
an attempt to determine how much space will be freed by the transaction
— this was rarely accurate enough to determine if a transaction should
be permitted when we are over quota, which is why dmu_tx_mark_netfree()
was introduced in 2014.

We also won’t attempt to give “credit” when overwriting existing blocks,
if those blocks may be freed. This allows us to remove the
do_free_accounting logic in dbuf_dirty(), and associated routines. This
logic attempted to predict what will be on disk when this txg syncs, to
know if the overwritten block will be freed (i.e. exists, and has no
snapshots).

OpenZFS-issue: https://www.illumos.org/issues/7793
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3704e0a
Upstream bugs: DLPX-32883a
Closes #5804 

Porting notes:
- DNODE_SIZE replaced with DNODE_MIN_SIZE in dmu_tx_count_dnode(),
  Using the default dnode size would be slightly better.
- DEBUG_DMU_TX wrappers and configure option removed.
- Resolved _by_dnode() conflicts these changes have not yet been
  applied to OpenZFS.
2017-03-07 09:51:59 -08:00

1645 lines
40 KiB
C

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
*/
#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
#include <sys/refcount.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/avl.h>
#include <sys/arc.h>
#include <sys/dmu_objset.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
#endif
extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
static int mzap_upgrade(zap_t **zapp,
void *tag, dmu_tx_t *tx, zap_flags_t flags);
uint64_t
zap_getflags(zap_t *zap)
{
if (zap->zap_ismicro)
return (0);
return (zap_f_phys(zap)->zap_flags);
}
int
zap_hashbits(zap_t *zap)
{
if (zap_getflags(zap) & ZAP_FLAG_HASH64)
return (48);
else
return (28);
}
uint32_t
zap_maxcd(zap_t *zap)
{
if (zap_getflags(zap) & ZAP_FLAG_HASH64)
return ((1<<16)-1);
else
return (-1U);
}
static uint64_t
zap_hash(zap_name_t *zn)
{
zap_t *zap = zn->zn_zap;
uint64_t h = 0;
if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
h = *(uint64_t *)zn->zn_key_orig;
} else {
h = zap->zap_salt;
ASSERT(h != 0);
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
int i;
const uint64_t *wp = zn->zn_key_norm;
ASSERT(zn->zn_key_intlen == 8);
for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
int j;
uint64_t word = *wp;
for (j = 0; j < zn->zn_key_intlen; j++) {
h = (h >> 8) ^
zfs_crc64_table[(h ^ word) & 0xFF];
word >>= NBBY;
}
}
} else {
int i, len;
const uint8_t *cp = zn->zn_key_norm;
/*
* We previously stored the terminating null on
* disk, but didn't hash it, so we need to
* continue to not hash it. (The
* zn_key_*_numints includes the terminating
* null for non-binary keys.)
*/
len = zn->zn_key_norm_numints - 1;
ASSERT(zn->zn_key_intlen == 1);
for (i = 0; i < len; cp++, i++) {
h = (h >> 8) ^
zfs_crc64_table[(h ^ *cp) & 0xFF];
}
}
}
/*
* Don't use all 64 bits, since we need some in the cookie for
* the collision differentiator. We MUST use the high bits,
* since those are the ones that we first pay attention to when
* choosing the bucket.
*/
h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
return (h);
}
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
{
size_t inlen, outlen;
int err;
ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
inlen = strlen(name) + 1;
outlen = ZAP_MAXNAMELEN;
err = 0;
(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
U8_UNICODE_LATEST, &err);
return (err);
}
boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
if (zn->zn_matchtype & MT_NORMALIZE) {
char norm[ZAP_MAXNAMELEN];
if (zap_normalize(zn->zn_zap, matchname, norm,
zn->zn_normflags) != 0)
return (B_FALSE);
return (strcmp(zn->zn_key_norm, norm) == 0);
} else {
return (strcmp(zn->zn_key_orig, matchname) == 0);
}
}
void
zap_name_free(zap_name_t *zn)
{
kmem_free(zn, sizeof (zap_name_t));
}
zap_name_t *
zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
{
zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
zn->zn_zap = zap;
zn->zn_key_intlen = sizeof (*key);
zn->zn_key_orig = key;
zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
zn->zn_matchtype = mt;
zn->zn_normflags = zap->zap_normflags;
/*
* If we're dealing with a case sensitive lookup on a mixed or
* insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
* will fold case to all caps overriding the lookup request.
*/
if (mt & MT_MATCH_CASE)
zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
if (zap->zap_normflags) {
/*
* We *must* use zap_normflags because this normalization is
* what the hash is computed from.
*/
if (zap_normalize(zap, key, zn->zn_normbuf,
zap->zap_normflags) != 0) {
zap_name_free(zn);
return (NULL);
}
zn->zn_key_norm = zn->zn_normbuf;
zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
} else {
if (mt != 0) {
zap_name_free(zn);
return (NULL);
}
zn->zn_key_norm = zn->zn_key_orig;
zn->zn_key_norm_numints = zn->zn_key_orig_numints;
}
zn->zn_hash = zap_hash(zn);
if (zap->zap_normflags != zn->zn_normflags) {
/*
* We *must* use zn_normflags because this normalization is
* what the matching is based on. (Not the hash!)
*/
if (zap_normalize(zap, key, zn->zn_normbuf,
zn->zn_normflags) != 0) {
zap_name_free(zn);
return (NULL);
}
zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
}
return (zn);
}
zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
ASSERT(zap->zap_normflags == 0);
zn->zn_zap = zap;
zn->zn_key_intlen = sizeof (*key);
zn->zn_key_orig = zn->zn_key_norm = key;
zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
zn->zn_matchtype = 0;
zn->zn_hash = zap_hash(zn);
return (zn);
}
static void
mzap_byteswap(mzap_phys_t *buf, size_t size)
{
int i, max;
buf->mz_block_type = BSWAP_64(buf->mz_block_type);
buf->mz_salt = BSWAP_64(buf->mz_salt);
buf->mz_normflags = BSWAP_64(buf->mz_normflags);
max = (size / MZAP_ENT_LEN) - 1;
for (i = 0; i < max; i++) {
buf->mz_chunk[i].mze_value =
BSWAP_64(buf->mz_chunk[i].mze_value);
buf->mz_chunk[i].mze_cd =
BSWAP_32(buf->mz_chunk[i].mze_cd);
}
}
void
zap_byteswap(void *buf, size_t size)
{
uint64_t block_type;
block_type = *(uint64_t *)buf;
if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
/* ASSERT(magic == ZAP_LEAF_MAGIC); */
mzap_byteswap(buf, size);
} else {
fzap_byteswap(buf, size);
}
}
static int
mze_compare(const void *arg1, const void *arg2)
{
const mzap_ent_t *mze1 = arg1;
const mzap_ent_t *mze2 = arg2;
int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
if (likely(cmp))
return (cmp);
return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
}
static void
mze_insert(zap_t *zap, int chunkid, uint64_t hash)
{
mzap_ent_t *mze;
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
mze->mze_chunkid = chunkid;
mze->mze_hash = hash;
mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
avl_add(&zap->zap_m.zap_avl, mze);
}
static mzap_ent_t *
mze_find(zap_name_t *zn)
{
mzap_ent_t mze_tofind;
mzap_ent_t *mze;
avl_index_t idx;
avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
ASSERT(zn->zn_zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
mze_tofind.mze_hash = zn->zn_hash;
mze_tofind.mze_cd = 0;
mze = avl_find(avl, &mze_tofind, &idx);
if (mze == NULL)
mze = avl_nearest(avl, idx, AVL_AFTER);
for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
return (mze);
}
return (NULL);
}
static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
mzap_ent_t mze_tofind;
mzap_ent_t *mze;
avl_index_t idx;
avl_tree_t *avl = &zap->zap_m.zap_avl;
uint32_t cd;
ASSERT(zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
mze_tofind.mze_hash = hash;
mze_tofind.mze_cd = 0;
cd = 0;
for (mze = avl_find(avl, &mze_tofind, &idx);
mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
if (mze->mze_cd != cd)
break;
cd++;
}
return (cd);
}
static void
mze_remove(zap_t *zap, mzap_ent_t *mze)
{
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
avl_remove(&zap->zap_m.zap_avl, mze);
kmem_free(mze, sizeof (mzap_ent_t));
}
static void
mze_destroy(zap_t *zap)
{
mzap_ent_t *mze;
void *avlcookie = NULL;
while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)))
kmem_free(mze, sizeof (mzap_ent_t));
avl_destroy(&zap->zap_m.zap_avl);
}
static zap_t *
mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
{
zap_t *winner;
zap_t *zap;
int i;
uint64_t *zap_hdr = (uint64_t *)db->db_data;
uint64_t zap_block_type = zap_hdr[0];
uint64_t zap_magic = zap_hdr[1];
ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
rw_enter(&zap->zap_rwlock, RW_WRITER);
zap->zap_objset = os;
zap->zap_object = obj;
zap->zap_dbuf = db;
if (zap_block_type != ZBT_MICRO) {
mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
winner = NULL; /* No actual winner here... */
goto handle_winner;
}
} else {
zap->zap_ismicro = TRUE;
}
/*
* Make sure that zap_ismicro is set before we let others see
* it, because zap_lockdir() checks zap_ismicro without the lock
* held.
*/
dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
winner = dmu_buf_set_user(db, &zap->zap_dbu);
if (winner != NULL)
goto handle_winner;
if (zap->zap_ismicro) {
zap->zap_salt = zap_m_phys(zap)->mz_salt;
zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
avl_create(&zap->zap_m.zap_avl, mze_compare,
sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze =
&zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0]) {
zap_name_t *zn;
zap->zap_m.zap_num_entries++;
zn = zap_name_alloc(zap, mze->mze_name, 0);
mze_insert(zap, i, zn->zn_hash);
zap_name_free(zn);
}
}
} else {
zap->zap_salt = zap_f_phys(zap)->zap_salt;
zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
ASSERT3U(sizeof (struct zap_leaf_header), ==,
2*ZAP_LEAF_CHUNKSIZE);
/*
* The embedded pointer table should not overlap the
* other members.
*/
ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
&zap_f_phys(zap)->zap_salt);
/*
* The embedded pointer table should end at the end of
* the block
*/
ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
(uintptr_t)zap_f_phys(zap), ==,
zap->zap_dbuf->db_size);
}
rw_exit(&zap->zap_rwlock);
return (zap);
handle_winner:
rw_exit(&zap->zap_rwlock);
rw_destroy(&zap->zap_rwlock);
if (!zap->zap_ismicro)
mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
kmem_free(zap, sizeof (zap_t));
return (winner);
}
static int
zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
dmu_object_info_t doi;
zap_t *zap;
krw_t lt;
objset_t *os = dmu_buf_get_objset(db);
uint64_t obj = db->db_object;
ASSERT0(db->db_offset);
*zapp = NULL;
dmu_object_info_from_db(db, &doi);
if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
return (SET_ERROR(EINVAL));
zap = dmu_buf_get_user(db);
if (zap == NULL) {
zap = mzap_open(os, obj, db);
if (zap == NULL) {
/*
* mzap_open() didn't like what it saw on-disk.
* Check for corruption!
*/
return (SET_ERROR(EIO));
}
}
/*
* We're checking zap_ismicro without the lock held, in order to
* tell what type of lock we want. Once we have some sort of
* lock, see if it really is the right type. In practice this
* can only be different if it was upgraded from micro to fat,
* and micro wanted WRITER but fat only needs READER.
*/
lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
rw_enter(&zap->zap_rwlock, lt);
if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
/* it was upgraded, now we only need reader */
ASSERT(lt == RW_WRITER);
ASSERT(RW_READER ==
((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
rw_downgrade(&zap->zap_rwlock);
lt = RW_READER;
}
zap->zap_objset = os;
if (lt == RW_WRITER)
dmu_buf_will_dirty(db, tx);
ASSERT3P(zap->zap_dbuf, ==, db);
ASSERT(!zap->zap_ismicro ||
zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
if (zap->zap_ismicro && tx && adding &&
zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
if (newsz > MZAP_MAX_BLKSZ) {
int err;
dprintf("upgrading obj %llu: num_entries=%u\n",
obj, zap->zap_m.zap_num_entries);
*zapp = zap;
err = mzap_upgrade(zapp, tag, tx, 0);
if (err != 0)
rw_exit(&zap->zap_rwlock);
return (err);
}
VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
zap->zap_m.zap_num_chunks =
db->db_size / MZAP_ENT_LEN - 1;
}
*zapp = zap;
return (0);
}
static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
dmu_buf_t *db;
int err;
err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
return (err);
}
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0) {
dmu_buf_rele(db, tag);
}
return (err);
}
int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
dmu_buf_t *db;
int err;
err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0)
dmu_buf_rele(db, tag);
return (err);
}
void
zap_unlockdir(zap_t *zap, void *tag)
{
rw_exit(&zap->zap_rwlock);
dmu_buf_rele(zap->zap_dbuf, tag);
}
static int
mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
mzap_phys_t *mzp;
int i, sz, nchunks;
int err = 0;
zap_t *zap = *zapp;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
sz = zap->zap_dbuf->db_size;
mzp = vmem_alloc(sz, KM_SLEEP);
bcopy(zap->zap_dbuf->db_data, mzp, sz);
nchunks = zap->zap_m.zap_num_chunks;
if (!flags) {
err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
1ULL << fzap_default_block_shift, 0, tx);
if (err) {
vmem_free(mzp, sz);
return (err);
}
}
dprintf("upgrading obj=%llu with %u chunks\n",
zap->zap_object, nchunks);
/* XXX destroy the avl later, so we can use the stored hash value */
mze_destroy(zap);
fzap_upgrade(zap, tx, flags);
for (i = 0; i < nchunks; i++) {
mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
zap_name_t *zn;
if (mze->mze_name[0] == 0)
continue;
dprintf("adding %s=%llu\n",
mze->mze_name, mze->mze_value);
zn = zap_name_alloc(zap, mze->mze_name, 0);
err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
tag, tx);
zap = zn->zn_zap; /* fzap_add_cd() may change zap */
zap_name_free(zn);
if (err)
break;
}
vmem_free(mzp, sz);
*zapp = zap;
return (err);
}
/*
* The "normflags" determine the behavior of the matchtype_t which is
* passed to zap_lookup_norm(). Names which have the same normalized
* version will be stored with the same hash value, and therefore we can
* perform normalization-insensitive lookups. We can be Unicode form-
* insensitive and/or case-insensitive. The following flags are valid for
* "normflags":
*
* U8_TEXTPREP_NFC
* U8_TEXTPREP_NFD
* U8_TEXTPREP_NFKC
* U8_TEXTPREP_NFKD
* U8_TEXTPREP_TOUPPER
*
* The *_NF* (Normalization Form) flags are mutually exclusive; at most one
* of them may be supplied.
*/
void
mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
dmu_tx_t *tx)
{
dmu_buf_t *db;
mzap_phys_t *zp;
VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
dmu_object_info_from_db(db, &doi);
ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
}
#endif
dmu_buf_will_dirty(db, tx);
zp = db->db_data;
zp->mz_block_type = ZBT_MICRO;
zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
zp->mz_normflags = normflags;
dmu_buf_rele(db, FTAG);
if (flags != 0) {
zap_t *zap;
/* Only fat zap supports flags; upgrade immediately. */
VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
B_FALSE, B_FALSE, FTAG, &zap));
VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags));
zap_unlockdir(zap, FTAG);
}
}
int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
0, tx));
}
int
zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
return (zap_create_claim_norm_dnsize(os, obj,
0, ot, bonustype, bonuslen, dnodesize, tx));
}
int
zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
bonuslen, 0, tx));
}
int
zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
int dnodesize, dmu_tx_t *tx)
{
int err;
err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
dnodesize, tx);
if (err != 0)
return (err);
mzap_create_impl(os, obj, normflags, 0, tx);
return (0);
}
uint64_t
zap_create(objset_t *os, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
}
uint64_t
zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
dnodesize, tx));
}
uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
0, tx));
}
uint64_t
zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
dnodesize, tx);
mzap_create_impl(os, obj, normflags, 0, tx);
return (obj);
}
uint64_t
zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
return (zap_create_flags_dnsize(os, normflags, flags, ot,
leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
}
uint64_t
zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
dnodesize, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
indirect_blockshift >= SPA_MINBLOCKSHIFT &&
indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
VERIFY(dmu_object_set_blocksize(os, obj,
1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
mzap_create_impl(os, obj, normflags, flags, tx);
return (obj);
}
int
zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
/*
* dmu_object_free will free the object number and free the
* data. Freeing the data will cause our pageout function to be
* called, which will destroy our data (zap_leaf_t's and zap_t).
*/
return (dmu_object_free(os, zapobj, tx));
}
void
zap_evict_sync(void *dbu)
{
zap_t *zap = dbu;
rw_destroy(&zap->zap_rwlock);
if (zap->zap_ismicro)
mze_destroy(zap);
else
mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
kmem_free(zap, sizeof (zap_t));
}
int
zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
{
zap_t *zap;
int err;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
if (!zap->zap_ismicro) {
err = fzap_count(zap, count);
} else {
*count = zap->zap_m.zap_num_entries;
}
zap_unlockdir(zap, FTAG);
return (err);
}
/*
* zn may be NULL; if not specified, it will be computed if needed.
* See also the comment above zap_entry_normalization_conflict().
*/
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
{
mzap_ent_t *other;
int direction = AVL_BEFORE;
boolean_t allocdzn = B_FALSE;
if (zap->zap_normflags == 0)
return (B_FALSE);
again:
for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
other && other->mze_hash == mze->mze_hash;
other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
if (zn == NULL) {
zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
MT_NORMALIZE);
allocdzn = B_TRUE;
}
if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
if (allocdzn)
zap_name_free(zn);
return (B_TRUE);
}
}
if (direction == AVL_BEFORE) {
direction = AVL_AFTER;
goto again;
}
if (allocdzn)
zap_name_free(zn);
return (B_FALSE);
}
/*
* Routines for manipulating attributes.
*/
int
zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf)
{
return (zap_lookup_norm(os, zapobj, name, integer_size,
num_integers, buf, 0, NULL, 0, NULL));
}
static int
zap_lookup_impl(zap_t *zap, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
int err = 0;
mzap_ent_t *mze;
zap_name_t *zn;
zn = zap_name_alloc(zap, name, mt);
if (zn == NULL)
return (SET_ERROR(ENOTSUP));
if (!zap->zap_ismicro) {
err = fzap_lookup(zn, integer_size, num_integers, buf,
realname, rn_len, ncp);
} else {
mze = mze_find(zn);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
if (num_integers < 1) {
err = SET_ERROR(EOVERFLOW);
} else if (integer_size != 8) {
err = SET_ERROR(EINVAL);
} else {
*(uint64_t *)buf =
MZE_PHYS(zap, mze)->mze_value;
(void) strlcpy(realname,
MZE_PHYS(zap, mze)->mze_name, rn_len);
if (ncp) {
*ncp = mzap_normalization_conflict(zap,
zn, mze);
}
}
}
}
zap_name_free(zn);
return (err);
}
int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
zap_t *zap;
int err;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_impl(zap, name, integer_size,
num_integers, buf, mt, realname, rn_len, ncp);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
{
zap_t *zap;
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, 0);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
fzap_prefetch(zn);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf)
{
return (zap_lookup_norm_by_dnode(dn, name, integer_size,
num_integers, buf, 0, NULL, 0, NULL));
}
int
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
zap_t *zap;
int err;
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_impl(zap, name, integer_size,
num_integers, buf, mt, realname, rn_len, ncp);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints)
{
zap_t *zap;
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
fzap_prefetch(zn);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
zap_t *zap;
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_lookup(zn, integer_size, num_integers, buf,
NULL, 0, NULL);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
int err = zap_lookup_norm(os, zapobj, name, 0,
0, NULL, 0, NULL, 0, NULL);
if (err == EOVERFLOW || err == EINVAL)
err = 0; /* found, but skipped reading the value */
return (err);
}
int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers)
{
zap_t *zap;
int err;
mzap_ent_t *mze;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, 0);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_length(zn, integer_size, num_integers);
} else {
mze = mze_find(zn);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
if (integer_size)
*integer_size = 8;
if (num_integers)
*num_integers = 1;
}
}
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
zap_t *zap;
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_length(zn, integer_size, num_integers);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (err);
}
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
int i;
zap_t *zap = zn->zn_zap;
int start = zap->zap_m.zap_alloc_next;
uint32_t cd;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
#ifdef ZFS_DEBUG
for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
ASSERTV(mzap_ent_phys_t *mze);
ASSERT(mze = &zap_m_phys(zap)->mz_chunk[i]);
ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
}
#endif
cd = mze_find_unused_cd(zap, zn->zn_hash);
/* given the limited size of the microzap, this can't happen */
ASSERT(cd < zap_maxcd(zap));
again:
for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0] == 0) {
mze->mze_value = value;
mze->mze_cd = cd;
(void) strlcpy(mze->mze_name, zn->zn_key_orig,
sizeof (mze->mze_name));
zap->zap_m.zap_num_entries++;
zap->zap_m.zap_alloc_next = i+1;
if (zap->zap_m.zap_alloc_next ==
zap->zap_m.zap_num_chunks)
zap->zap_m.zap_alloc_next = 0;
mze_insert(zap, i, zn->zn_hash);
return;
}
}
if (start != 0) {
start = 0;
goto again;
}
cmn_err(CE_PANIC, "out of entries!");
}
static int
zap_add_impl(zap_t *zap, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx, void *tag)
{
int err = 0;
mzap_ent_t *mze;
const uint64_t *intval = val;
zap_name_t *zn;
zn = zap_name_alloc(zap, key, 0);
if (zn == NULL) {
zap_unlockdir(zap, tag);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(key) >= MZAP_NAME_LEN) {
err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
if (err == 0) {
err = fzap_add(zn, integer_size, num_integers, val,
tag, tx);
}
zap = zn->zn_zap; /* fzap_add() may change zap */
} else {
mze = mze_find(zn);
if (mze != NULL) {
err = SET_ERROR(EEXIST);
} else {
mzap_addent(zn, *intval);
}
}
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlockdir(zap, tag);
return (err);
}
int
zap_add(objset_t *os, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
zap_t *zap;
int err;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
/* zap_add_impl() calls zap_unlockdir() */
return (err);
}
int
zap_add_by_dnode(dnode_t *dn, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
zap_t *zap;
int err;
err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
/* zap_add_impl() calls zap_unlockdir() */
return (err);
}
int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
zap_t *zap;
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
zap_t *zap;
mzap_ent_t *mze;
const uint64_t *intval = val;
zap_name_t *zn;
int err;
#ifdef ZFS_DEBUG
uint64_t oldval;
/*
* If there is an old value, it shouldn't change across the
* lockdir (eg, due to bprewrite's xlation).
*/
if (integer_size == 8 && num_integers == 1)
(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, 0);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
err = fzap_update(zn, integer_size, num_integers, val,
FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(name) >= MZAP_NAME_LEN) {
dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
zapobj, integer_size, num_integers, name);
err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
if (err == 0) {
err = fzap_update(zn, integer_size, num_integers,
val, FTAG, tx);
}
zap = zn->zn_zap; /* fzap_update() may change zap */
} else {
mze = mze_find(zn);
if (mze != NULL) {
ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
MZE_PHYS(zap, mze)->mze_value = *intval;
} else {
mzap_addent(zn, *intval);
}
}
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
zap_t *zap;
zap_name_t *zn;
int err;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
return (zap_remove_norm(os, zapobj, name, 0, tx));
}
static int
zap_remove_impl(zap_t *zap, const char *name,
matchtype_t mt, dmu_tx_t *tx)
{
mzap_ent_t *mze;
zap_name_t *zn;
int err = 0;
zn = zap_name_alloc(zap, name, mt);
if (zn == NULL)
return (SET_ERROR(ENOTSUP));
if (!zap->zap_ismicro) {
err = fzap_remove(zn, tx);
} else {
mze = mze_find(zn);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
zap->zap_m.zap_num_entries--;
bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
sizeof (mzap_ent_phys_t));
mze_remove(zap, mze);
}
}
zap_name_free(zn);
return (err);
}
int
zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
matchtype_t mt, dmu_tx_t *tx)
{
zap_t *zap;
int err;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
err = zap_remove_impl(zap, name, mt, tx);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
{
zap_t *zap;
int err;
err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
err = zap_remove_impl(zap, name, 0, tx);
zap_unlockdir(zap, FTAG);
return (err);
}
int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx)
{
zap_t *zap;
int err;
zap_name_t *zn;
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_remove(zn, tx);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
return (err);
}
/*
* Routines for iterating over the attributes.
*/
void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized)
{
zc->zc_objset = os;
zc->zc_zap = NULL;
zc->zc_leaf = NULL;
zc->zc_zapobj = zapobj;
zc->zc_serialized = serialized;
zc->zc_hash = 0;
zc->zc_cd = 0;
}
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
zap_cursor_init_serialized(zc, os, zapobj, 0);
}
void
zap_cursor_fini(zap_cursor_t *zc)
{
if (zc->zc_zap) {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
zap_unlockdir(zc->zc_zap, NULL);
zc->zc_zap = NULL;
}
if (zc->zc_leaf) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
zap_put_leaf(zc->zc_leaf);
zc->zc_leaf = NULL;
}
zc->zc_objset = NULL;
}
uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
if (zc->zc_hash == -1ULL)
return (-1ULL);
if (zc->zc_zap == NULL)
return (zc->zc_serialized);
ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
/*
* We want to keep the high 32 bits of the cursor zero if we can, so
* that 32-bit programs can access this. So usually use a small
* (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
* of the cursor.
*
* [ collision differentiator | zap_hashbits()-bit hash value ]
*/
return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
int err;
avl_index_t idx;
mzap_ent_t mze_tofind;
mzap_ent_t *mze;
if (zc->zc_hash == -1ULL)
return (SET_ERROR(ENOENT));
if (zc->zc_zap == NULL) {
int hb;
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
if (err)
return (err);
/*
* To support zap_cursor_init_serialized, advance, retrieve,
* we must add to the existing zc_cd, which may already
* be 1 due to the zap_cursor_advance.
*/
ASSERT(zc->zc_hash == 0);
hb = zap_hashbits(zc->zc_zap);
zc->zc_hash = zc->zc_serialized << (64 - hb);
zc->zc_cd += zc->zc_serialized >> hb;
if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
zc->zc_cd = 0;
} else {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
}
if (!zc->zc_zap->zap_ismicro) {
err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
} else {
mze_tofind.mze_hash = zc->zc_hash;
mze_tofind.mze_cd = zc->zc_cd;
mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
if (mze == NULL) {
mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
idx, AVL_AFTER);
}
if (mze) {
mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
za->za_normalization_conflict =
mzap_normalization_conflict(zc->zc_zap, NULL, mze);
za->za_integer_length = 8;
za->za_num_integers = 1;
za->za_first_integer = mzep->mze_value;
(void) strcpy(za->za_name, mzep->mze_name);
zc->zc_hash = mze->mze_hash;
zc->zc_cd = mze->mze_cd;
err = 0;
} else {
zc->zc_hash = -1ULL;
err = SET_ERROR(ENOENT);
}
}
rw_exit(&zc->zc_zap->zap_rwlock);
return (err);
}
void
zap_cursor_advance(zap_cursor_t *zc)
{
if (zc->zc_hash == -1ULL)
return;
zc->zc_cd++;
}
int
zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
{
int err;
zap_t *zap;
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
bzero(zs, sizeof (zap_stats_t));
if (zap->zap_ismicro) {
zs->zs_blocksize = zap->zap_dbuf->db_size;
zs->zs_num_entries = zap->zap_m.zap_num_entries;
zs->zs_num_blocks = 1;
} else {
fzap_get_stats(zap, zs);
}
zap_unlockdir(zap, FTAG);
return (0);
}
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);
#endif