Update vendor/illumos/dist and vendor/illumos-gate/dist

to illumos-gate revision 13794:7c5e0e746b2c

Obtained from:	ssh://anonhg@hg.illumos.org/illumos-gate
This commit is contained in:
Martin Matuska 2012-09-04 21:58:22 +00:00
parent c9b711eab6
commit 1034179220
16 changed files with 238 additions and 57 deletions

View File

@ -5835,6 +5835,8 @@ main(int argc, char **argv)
(void) setvbuf(stdout, NULL, _IOLBF, 0);
dprintf_setup(&argc, argv);
if (!ischild) {
process_options(argc, argv);

View File

@ -61,6 +61,7 @@ extern "C" {
#include <atomic.h>
#include <dirent.h>
#include <time.h>
#include <procfs.h>
#include <libsysevent.h>
#include <sys/note.h>
#include <sys/types.h>

View File

@ -135,6 +135,12 @@
#include <sys/kstat.h>
#include <zfs_fletcher.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
@ -474,6 +480,7 @@ static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
static void arc_buf_watch(arc_buf_t *buf);
static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
@ -949,6 +956,50 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum);
mutex_exit(&buf->b_hdr->b_freeze_lock);
arc_buf_watch(buf);
}
#ifndef _KERNEL
/*
 * Message layout written to arc_procfd by arc_buf_watch() and
 * arc_buf_unwatch(): a command code (PCWATCH in both callers) immediately
 * followed by the prwatch_t operand describing the watched area.
 * Userland-only; the kernel build has no watchpoint support here.
 */
typedef struct procctl {
long cmd;
prwatch_t prwatch;
} procctl_t;
#endif
/*
 * Userland only: when arc_watch is enabled, send a PCWATCH request with a
 * zero size and no watch flags for buf's data address to arc_procfd.
 * In the kernel, or with arc_watch off, this is a no-op.
 */
/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
	procctl_t ctl;
	int result;

	if (!arc_watch)
		return;

	/* Zero size / zero flags: presumably cancels the watch; see proc(4). */
	ctl.prwatch.pr_wflags = 0;
	ctl.prwatch.pr_size = 0;
	ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
	ctl.cmd = PCWATCH;

	/* The whole control message must be accepted in a single write. */
	result = write(arc_procfd, &ctl, sizeof (ctl));
	ASSERT3U(result, ==, sizeof (ctl));
#endif
}
/*
 * Userland only: when arc_watch is enabled, send a PCWATCH request asking
 * for a write watchpoint (WA_WRITE) over the b_size bytes of buf's data
 * to arc_procfd.  In the kernel, or with arc_watch off, this is a no-op.
 */
/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
	procctl_t ctl;
	int result;

	if (!arc_watch)
		return;

	/* Cover the entire buffer and trap on writes only. */
	ctl.prwatch.pr_wflags = WA_WRITE;
	ctl.prwatch.pr_size = buf->b_hdr->b_size;
	ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
	ctl.cmd = PCWATCH;

	/* The whole control message must be accepted in a single write. */
	result = write(arc_procfd, &ctl, sizeof (ctl));
	ASSERT3U(result, ==, sizeof (ctl));
#endif
}
void
@ -975,6 +1026,8 @@ arc_buf_thaw(arc_buf_t *buf)
}
mutex_exit(&buf->b_hdr->b_freeze_lock);
arc_buf_unwatch(buf);
}
void
@ -992,6 +1045,7 @@ arc_buf_freeze(arc_buf_t *buf)
buf->b_hdr->b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
mutex_exit(hash_lock);
}
static void
@ -1348,21 +1402,22 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
* the buffer is placed on l2arc_free_on_write to be freed later.
*/
static void
arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
void *data, size_t size)
arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
{
arc_buf_hdr_t *hdr = buf->b_hdr;
if (HDR_L2_WRITING(hdr)) {
l2arc_data_free_t *df;
df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
df->l2df_data = data;
df->l2df_size = size;
df->l2df_data = buf->b_data;
df->l2df_size = hdr->b_size;
df->l2df_func = free_func;
mutex_enter(&l2arc_free_on_write_mtx);
list_insert_head(l2arc_free_on_write, df);
mutex_exit(&l2arc_free_on_write_mtx);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
free_func(data, size);
free_func(buf->b_data, hdr->b_size);
}
}
@ -1378,16 +1433,15 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf->b_hdr, zio_buf_free,
buf->b_data, size);
arc_buf_data_free(buf, zio_buf_free);
arc_space_return(size, ARC_SPACE_DATA);
} else {
ASSERT(type == ARC_BUFC_DATA);
arc_buf_data_free(buf->b_hdr,
zio_data_buf_free, buf->b_data, size);
arc_buf_data_free(buf, zio_data_buf_free);
ARCSTAT_INCR(arcstat_data_size, -size);
atomic_add_64(&arc_size, -size);
}
@ -2556,6 +2610,7 @@ arc_read_done(zio_t *zio)
}
arc_cksum_compute(buf, B_FALSE);
arc_buf_watch(buf);
if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
/*
@ -3113,6 +3168,7 @@ arc_release(arc_buf_t *buf, void *tag)
}
hdr->b_datacnt -= 1;
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
mutex_exit(hash_lock);

View File

@ -189,7 +189,8 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
break;
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST,
bte.be_birth_txg, &bte.be_zb,
TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST,
bptree_visit_cb, &ba);
if (free) {
ASSERT(err == 0 || err == ERESTART);

View File

@ -63,6 +63,8 @@ typedef struct traverse_data {
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *buf, uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
arc_buf_t *buf, uint64_t objset, uint64_t object);
static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
@ -178,9 +180,34 @@ traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
bcopy(zb, td->td_resume, sizeof (*td->td_resume));
}
/*
 * Issue a fire-and-forget ARC read (ARC_NOWAIT | ARC_PREFETCH) for the
 * block at bp, bookmarked as zb, if the traversal asked for metadata
 * prefetch and the block qualifies.  The arc_read() result is ignored.
 */
static void
traverse_prefetch_metadata(traverse_data_t *td,
    arc_buf_t *pbuf, const blkptr_t *bp, const zbookmark_t *zb)
{
	uint32_t aflags;

	/* Metadata prefetch is opt-in per traversal. */
	if ((td->td_flags & TRAVERSE_PREFETCH_METADATA) == 0)
		return;

	/*
	 * While resuming from a bookmark, skip the prefetch: some children
	 * will not be needed (and in fact may have already been freed).
	 */
	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
		return;

	/* Holes and blocks born at or before td_min_txg are skipped. */
	if (BP_IS_HOLE(bp))
		return;
	if (bp->blk_birth <= td->td_min_txg)
		return;

	/* Among level-0 blocks, only dnode blocks are prefetched. */
	if (BP_GET_LEVEL(bp) != 0 || BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		aflags = ARC_NOWAIT | ARC_PREFETCH;
		(void) arc_read(NULL, td->td_spa, bp,
		    pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb);
	}
}
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
arc_buf_t *pbuf, const blkptr_t *bp, const zbookmark_t *zb)
{
zbookmark_t czb;
int err = 0, lasterr = 0;
@ -243,14 +270,21 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
return (err);
/* recursively visitbp() blocks below this */
cbp = buf->b_data;
for (i = 0; i < epb; i++, cbp++) {
for (i = 0; i < epb; i++) {
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp, buf, cbp, &czb);
traverse_prefetch_metadata(td, buf, &cbp[i], &czb);
}
/* recursively visitbp() blocks below this */
for (i = 0; i < epb; i++) {
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp, buf, &cbp[i], &czb);
if (err) {
if (!hard)
break;
@ -267,11 +301,16 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
return (err);
dnp = buf->b_data;
for (i = 0; i < epb; i++) {
prefetch_dnode_metadata(td, &dnp[i], buf, zb->zb_objset,
zb->zb_blkid * epb + i);
}
/* recursively visitbp() blocks below this */
dnp = buf->b_data;
for (i = 0; i < epb; i++, dnp++) {
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
for (i = 0; i < epb; i++) {
err = traverse_dnode(td, &dnp[i], buf, zb->zb_objset,
zb->zb_blkid * epb + i);
if (err) {
if (!hard)
@ -292,6 +331,15 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
osp = buf->b_data;
dnp = &osp->os_meta_dnode;
prefetch_dnode_metadata(td, dnp, buf, zb->zb_objset,
DMU_META_DNODE_OBJECT);
/*
 * The user/group accounting dnodes are only touched when the buffer
 * is large enough to hold a complete objset_phys_t.  Each dnode is
 * bookmarked under its own object number; the group dnode previously
 * reused DMU_USERUSED_OBJECT, giving its prefetch a wrong bookmark.
 */
if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
	prefetch_dnode_metadata(td, &osp->os_userused_dnode,
	    buf, zb->zb_objset, DMU_USERUSED_OBJECT);
	prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
	    buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
}
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
DMU_META_DNODE_OBJECT);
if (err && hard) {
@ -334,6 +382,24 @@ post:
return (err != 0 ? err : lasterr);
}
/*
 * Request metadata prefetch for every block pointer stored directly in
 * dnp: the dn_nblkptr entries of dn_blkptr[] (bookmarked at level
 * dn_nlevels - 1, block id = array index) and, when the dnode has
 * DNODE_FLAG_SPILL_BLKPTR set, its spill block pointer (bookmarked at
 * level 0 with DMU_SPILL_BLKID).
 */
static void
prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object)
{
	zbookmark_t bm;
	int idx = 0;

	while (idx < dnp->dn_nblkptr) {
		SET_BOOKMARK(&bm, objset, object, dnp->dn_nlevels - 1, idx);
		traverse_prefetch_metadata(td, buf, &dnp->dn_blkptr[idx], &bm);
		idx++;
	}

	if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
		SET_BOOKMARK(&bm, objset, object, 0, DMU_SPILL_BLKID);
		traverse_prefetch_metadata(td, buf, &dnp->dn_spill, &bm);
	}
}
static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *buf, uint64_t objset, uint64_t object)
@ -344,8 +410,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, buf,
(blkptr_t *)&dnp->dn_blkptr[j], &czb);
err = traverse_visitbp(td, dnp, buf, &dnp->dn_blkptr[j], &czb);
if (err) {
if (!hard)
break;
@ -354,10 +419,8 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
}
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset,
object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, buf,
(blkptr_t *)&dnp->dn_spill, &czb);
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, buf, &dnp->dn_spill, &czb);
if (err) {
if (!hard)
return (err);
@ -438,6 +501,12 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
ASSERT(ds == NULL || objset == ds->ds_object);
ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
/*
* The data prefetching mechanism (the prefetch thread) is incompatible
* with resuming from a bookmark.
*/
ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
td.td_spa = spa;
td.td_objset = objset;
td.td_rootbp = rootbp;
@ -464,7 +533,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
traverse_zil(&td, &os->os_zil_header);
}
if (!(flags & TRAVERSE_PREFETCH) ||
if (!(flags & TRAVERSE_PREFETCH_DATA) ||
0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
&td, TQ_NOQUEUE))
pd.pd_exited = B_TRUE;

View File

@ -429,6 +429,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
int epbs;
uint64_t l0span = 0, nl1blks = 0;
if (dn->dn_nlevels == 0)
return;
@ -461,6 +462,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
nblks = dn->dn_maxblkid - blkid;
}
l0span = nblks; /* save for later use to calc level > 1 overhead */
if (dn->dn_nlevels == 1) {
int i;
for (i = 0; i < nblks; i++) {
@ -473,24 +475,10 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
}
unref += BP_GET_ASIZE(bp);
}
nl1blks = 1;
nblks = 0;
}
/*
* Add in memory requirements of higher-level indirects.
* This assumes a worst-possible scenario for dn_nlevels.
*/
{
uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
int level = (dn->dn_nlevels > 1) ? 2 : 1;
while (level++ < DN_MAX_LEVELS) {
txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
blkcnt = 1 + (blkcnt >> epbs);
}
ASSERT(blkcnt <= dn->dn_nblkptr);
}
lastblk = blkid + nblks - 1;
while (nblks) {
dmu_buf_impl_t *dbuf;
@ -561,11 +549,35 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
}
dbuf_rele(dbuf, FTAG);
++nl1blks;
blkid += tochk;
nblks -= tochk;
}
rw_exit(&dn->dn_struct_rwlock);
/*
* Add in memory requirements of higher-level indirects.
* This assumes a worst-possible scenario for dn_nlevels and a
* worst-possible distribution of l1-blocks over the region to free.
*/
{
uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
int level = 2;
/*
* Here we don't use DN_MAX_LEVELS, but calculate it with the
* given datablkshift and indblkshift. This makes the
* difference between 19 and 8 on large files.
*/
int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
(dn->dn_indblkshift - SPA_BLKPTRSHIFT);
while (level++ < maxlevel) {
txh->txh_memory_tohold += MIN(blkcnt, (nl1blks >> epbs))
<< dn->dn_indblkshift;
blkcnt = 1 + (blkcnt >> epbs);
}
}
/* account for new level 1 indirect blocks that might show up */
if (skipped > 0) {
txh->txh_fudge += skipped << dn->dn_indblkshift;

View File

@ -2302,7 +2302,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
}
}
}
}
void

View File

@ -456,12 +456,14 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
/*
* There should be exactly two holds, both from
* dsl_dataset_destroy: one on the dd directory, and one on its
* head ds. Otherwise, someone is trying to lookup something
* inside this dir while we want to destroy it. The
* config_rwlock ensures that nobody else opens it after we
* check.
* head ds. If there are more holds, then a concurrent thread is
* performing a lookup inside this dir while we're trying to destroy
* it. To minimize this possibility, we perform this check only
* in syncing context and fail the operation if we encounter
* additional holds. The dp_config_rwlock ensures that nobody else
* opens it after we check.
*/
if (dmu_buf_refcount(dd->dd_dbuf) > 2)
if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2)
return (EBUSY);
err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);

View File

@ -230,12 +230,7 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp,
dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx)
{
dsl_sync_task_group_t *dstg;
if (!spa_writeable(dp->dp_spa))
return;
dstg = dsl_sync_task_group_create(dp);
dsl_sync_task_group_t *dstg = dsl_sync_task_group_create(dp);
dsl_sync_task_create(dstg, checkfunc, syncfunc,
arg1, arg2, blocks_modified);
dsl_sync_task_group_nowait(dstg, tx);

View File

@ -303,7 +303,7 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
dmu_tx_t *tx;
nvlist_t *nvarg;
if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY)
if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
return (EINVAL);
tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@ -439,8 +439,9 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
/*
* If this is part of creating a pool, not everything is
* initialized yet, so don't bother logging the internal events.
* Likewise if the pool is not writeable.
*/
if (tx->tx_txg == TXG_INITIAL) {
if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) {
fnvlist_free(nvl);
return;
}

View File

@ -1600,6 +1600,18 @@ spa_init(int mode)
spa_mode_global = mode;
#ifndef _KERNEL
if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
arc_procfd = open("/proc/self/ctl", O_WRONLY);
if (arc_procfd == -1) {
perror("could not enable watchpoints: "
"opening /proc/self/ctl failed: ");
} else {
arc_watch = B_TRUE;
}
}
#endif
refcount_init();
unique_init();
zio_init();

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_ARC_H
@ -135,6 +136,11 @@ void l2arc_fini(void);
void l2arc_start(void);
void l2arc_stop(void);
#ifndef _KERNEL
extern boolean_t arc_watch;
extern int arc_procfd;
#endif
#ifdef __cplusplus
}
#endif

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_DNODE_H
@ -276,7 +277,6 @@ void dnode_byteswap(dnode_phys_t *dnp);
void dnode_buf_byteswap(void *buf, size_t size);
void dnode_verify(dnode_t *dn);
int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
uint64_t dnode_current_max_length(dnode_t *dn);
void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
void dnode_clear_range(dnode_t *dn, uint64_t blkid,
uint64_t nblks, dmu_tx_t *tx);

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
@ -75,6 +76,10 @@ extern void zfs_dbgmsg_init(void);
extern void zfs_dbgmsg_fini(void);
extern void zfs_dbgmsg(const char *fmt, ...);
#ifndef _KERNEL
extern int dprintf_find_string(const char *string);
#endif
#ifdef __cplusplus
}
#endif

View File

@ -4335,7 +4335,17 @@ zfs_ioc_pool_reopen(zfs_cmd_t *zc)
return (error);
spa_vdev_state_enter(spa, SCL_NONE);
/*
* If a resilver is already in progress then set the
* spa_scrub_reopen flag to B_TRUE so that we don't restart
* the scan as a side effect of the reopen. Otherwise, let
* vdev_open() decide if a resilver is required.
*/
spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool);
vdev_reopen(spa->spa_root_vdev);
spa->spa_scrub_reopen = B_FALSE;
(void) spa_vdev_state_exit(spa, NULL, 0);
spa_close(spa, FTAG);
return (0);

View File

@ -125,11 +125,21 @@ zio_init(void)
while (p2 & (p2 - 1))
p2 &= p2 - 1;
#ifndef _KERNEL
/*
* If we are using watchpoints, put each buffer on its own page,
* to eliminate the performance overhead of trapping to the
* kernel when modifying a non-watched buffer that shares the
* page with a watched buffer.
*/
if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
continue;
#endif
if (size <= 4 * SPA_MINBLOCKSIZE) {
align = SPA_MINBLOCKSIZE;
} else if (P2PHASE(size, PAGESIZE) == 0) {
} else if (IS_P2ALIGNED(size, PAGESIZE)) {
align = PAGESIZE;
} else if (P2PHASE(size, p2 >> 2) == 0) {
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
align = p2 >> 2;
}