MFC r207670, r208130, r208131:

MFC r207670:
Introduce hardforce export option (-F) for "zpool export".
When exporting with this flag, zpool.cache remains untouched.
OpenSolaris onnv revision: 8211:32722be6ad3b

MFC r208130:
Fix performance problem with ZFS prefetch caching [1]
Add statistics for ZFS prefetch (sysctl kstat.zfs.misc.zfetchstats)
OpenSolaris onnv revision: 10474:0e96dd3b905a (partial)

MFC r208131:
Fix deadlock between zfs_dirent_lock and zfs_rmdir
OpenSolaris onnv revision: 11321:506b7043a14c

Reported by:	jhell@dataix.net (private e-mail) [1]
Approved by:	pjd, delphij (mentor)
Obtained from:	OpenSolaris (Bug ID: 6775357, 6859997, 6868951, 6847615)
This commit is contained in:
Martin Matuska 2010-05-19 06:49:52 +00:00
parent 9c6a66a45b
commit bf7a7e9b29
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/stable/8/; revision=208288
14 changed files with 185 additions and 34 deletions

View File

@ -879,17 +879,21 @@ int
zpool_do_export(int argc, char **argv)
{
boolean_t force = B_FALSE;
boolean_t hardforce = B_FALSE;
int c;
zpool_handle_t *zhp;
int ret;
int i;
/* check options */
while ((c = getopt(argc, argv, "f")) != -1) {
while ((c = getopt(argc, argv, "fF")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
break;
case 'F':
hardforce = B_TRUE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@ -919,8 +923,12 @@ zpool_do_export(int argc, char **argv)
continue;
}
if (zpool_export(zhp, force) != 0)
if (hardforce) {
if (zpool_export_force(zhp) != 0)
ret = 1;
} else if (zpool_export(zhp, force) != 0) {
ret = 1;
}
zpool_close(zhp);
}

View File

@ -3145,7 +3145,7 @@ ztest_spa_import_export(char *oldname, char *newname)
/*
* Export it.
*/
error = spa_export(oldname, &config, B_FALSE);
error = spa_export(oldname, &config, B_FALSE, B_FALSE);
if (error)
fatal(0, "spa_export('%s') = %d", oldname, error);

View File

@ -289,6 +289,7 @@ extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
* Import and export functions
*/
extern int zpool_export(zpool_handle_t *, boolean_t);
extern int zpool_export_force(zpool_handle_t *);
extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
char *altroot);
extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,

View File

@ -1096,7 +1096,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
* mounted datasets in the pool.
*/
int
zpool_export(zpool_handle_t *zhp, boolean_t force)
zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
@ -1109,6 +1109,7 @@ zpool_export(zpool_handle_t *zhp, boolean_t force)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_cookie = force;
zc.zc_guid = hardforce;
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
switch (errno) {
@ -1129,6 +1130,18 @@ zpool_export(zpool_handle_t *zhp, boolean_t force)
return (0);
}
/*
 * Regular pool export.  Hands the request to zpool_export_common() with
 * the caller's 'force' setting and with hardforce fixed to B_FALSE, and
 * passes that function's result straight back to the caller.
 */
int
zpool_export(zpool_handle_t *zhp, boolean_t force)
{
	int rv;

	rv = zpool_export_common(zhp, force, B_FALSE);
	return (rv);
}
/*
 * Hard-forced pool export (the "zpool export -F" path).  Hands the request
 * to zpool_export_common() with both 'force' and 'hardforce' set to B_TRUE
 * and passes that function's result straight back to the caller.
 */
int
zpool_export_force(zpool_handle_t *zhp)
{
	int rv;

	rv = zpool_export_common(zhp, B_TRUE, B_TRUE);
	return (rv);
}
/*
* zpool_import() is a contracted interface. Should be kept the same
* if possible.

View File

@ -1192,6 +1192,7 @@ dmu_init(void)
{
dbuf_init();
dnode_init();
zfetch_init();
arc_init();
l2arc_init();
}
@ -1200,6 +1201,7 @@ void
dmu_fini(void)
{
arc_fini();
zfetch_fini();
dnode_fini();
dbuf_fini();
l2arc_fini();

View File

@ -19,18 +19,17 @@
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>
/*
* I'm against tune-ables, but these should probably exist as tweakable globals
@ -77,6 +76,41 @@ static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
/*
 * Prefetch statistics, one named kstat entry per counter.  Counters are
 * only ever incremented, through ZFETCHSTAT_BUMP():
 *
 *   hits / misses                  - dmu_zfetch(): dmu_zfetch_find()
 *                                    returned non-zero / zero.
 *   colinear_hits / _misses        - dmu_zfetch(): result of the
 *                                    dmu_zfetch_colinear() fallback.
 *   stride_hits / _misses          - dmu_zfetch_find(): the requested offset
 *                                    lies inside an existing stream and the
 *                                    'prefetched' argument is set / clear.
 *   reclaim_successes / _failures  - dmu_zfetch(): 'newstream' is non-NULL /
 *                                    NULL when a new stream is needed.
 *   stream_resets                  - dmu_zfetch_find(): the 'reset' branch.
 *   stream_noresets                - dmu_zfetch_find(): the branch that
 *                                    calls dmu_zfetch_dofetch() instead.
 *   bogus_streams                  - dmu_zfetch_find(): a stream with
 *                                    zst_len == 0 was skipped.
 *
 * The field order must match the initializer of zfetch_stats.
 */
typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits;
kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_colinear_hits;
kstat_named_t zfetchstat_colinear_misses;
kstat_named_t zfetchstat_stride_hits;
kstat_named_t zfetchstat_stride_misses;
kstat_named_t zfetchstat_reclaim_successes;
kstat_named_t zfetchstat_reclaim_failures;
kstat_named_t zfetchstat_stream_resets;
kstat_named_t zfetchstat_stream_noresets;
kstat_named_t zfetchstat_bogus_streams;
} zfetch_stats_t;
/*
 * Storage behind the "zfetchstats" kstat; zfetch_init() points ks_data at
 * this object.  Entries are positional and must stay in the same order as
 * the fields of zfetch_stats_t.  Every counter is a KSTAT_DATA_UINT64.
 *
 * NOTE(review): the exported names "streams_resets" and "streams_noresets"
 * do not match their field names (zfetchstat_stream_resets and
 * zfetchstat_stream_noresets).  The strings are what consumers of the kstat
 * see, so they are left untouched here.
 */
static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "colinear_hits", KSTAT_DATA_UINT64 },
{ "colinear_misses", KSTAT_DATA_UINT64 },
{ "stride_hits", KSTAT_DATA_UINT64 },
{ "stride_misses", KSTAT_DATA_UINT64 },
{ "reclaim_successes", KSTAT_DATA_UINT64 },
{ "reclaim_failures", KSTAT_DATA_UINT64 },
{ "streams_resets", KSTAT_DATA_UINT64 },
{ "streams_noresets", KSTAT_DATA_UINT64 },
{ "bogus_streams", KSTAT_DATA_UINT64 },
};
/*
 * Atomically add 'val' to the zfetch_stats counter named by 'stat'
 * (a field name of zfetch_stats_t, e.g. zfetchstat_hits).
 *
 * The expansion is a single expression with no trailing semicolon; the
 * caller writes the ';'.  This keeps "if (c) ZFETCHSTAT_BUMP(x); else ..."
 * well-formed, which a semicolon inside the macro body would break.
 */
#define	ZFETCHSTAT_INCR(stat, val) \
	atomic_add_64(&zfetch_stats.stat.value.ui64, (val))

/* Increment the zfetch_stats counter named by 'stat' by one. */
#define	ZFETCHSTAT_BUMP(stat)	ZFETCHSTAT_INCR(stat, 1)
/*
 * Handle of the "zfetchstats" kstat.  Assigned by zfetch_init() from
 * kstat_create() (so it may be NULL) and reset to NULL by zfetch_fini()
 * after the kstat has been deleted.
 */
kstat_t *zfetch_ksp;
/*
* Given a zfetch structure and a zstream structure, determine whether the
* blocks to be read are part of a co-linear pair of existing prefetch
@ -213,6 +247,29 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
zs->zst_last = LBOLT;
}
void
zfetch_init(void)
{
zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (zfetch_ksp != NULL) {
zfetch_ksp->ks_data = &zfetch_stats;
kstat_install(zfetch_ksp);
}
}
/*
 * Undo zfetch_init(): delete the "zfetchstats" kstat if one was created,
 * then clear the handle.  Does nothing when zfetch_ksp is already NULL.
 */
void
zfetch_fini(void)
{
	if (zfetch_ksp == NULL)
		return;

	kstat_delete(zfetch_ksp);
	zfetch_ksp = NULL;
}
/*
* This takes a pointer to a zfetch structure and a dnode. It performs the
* necessary setup for the zfetch structure, grokking data from the
@ -283,7 +340,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
}
/*
* given a zfetch and a zsearch structure, see if there is an associated zstream
* given a zfetch and a zstream structure, see if there is an associated zstream
* for this block read. If so, it starts a prefetch for the stream it
* located and returns true, otherwise it returns false
*/
@ -315,6 +372,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
*/
if (zs->zst_len == 0) {
/* bogus stream */
ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
continue;
}
@ -324,9 +382,14 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
*/
if (zh->zst_offset >= zs->zst_offset &&
zh->zst_offset < zs->zst_offset + zs->zst_len) {
/* already fetched */
rc = 1;
goto out;
if (prefetched) {
/* already fetched */
ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
rc = 1;
goto out;
} else {
ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
}
}
/*
@ -439,6 +502,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
if (reset) {
zstream_t *remove = zs;
ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
rc = 0;
mutex_exit(&zs->zst_lock);
rw_exit(&zf->zf_rwlock);
@ -457,6 +521,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
}
}
} else {
ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
rc = 1;
dmu_zfetch_dofetch(zf, zs);
mutex_exit(&zs->zst_lock);
@ -513,13 +578,12 @@ dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
zs_next = list_next(&zf->zf_stream, zs_walk);
if (dmu_zfetch_streams_equal(zs_walk, zs)) {
return (0);
return (0);
}
}
list_insert_head(&zf->zf_stream, zs);
zf->zf_stream_cnt++;
return (1);
}
@ -623,8 +687,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
P2ALIGN(offset, blksz)) >> blkshft;
fetched = dmu_zfetch_find(zf, &zst, prefetched);
if (!fetched) {
fetched = dmu_zfetch_colinear(zf, &zst);
if (fetched) {
ZFETCHSTAT_BUMP(zfetchstat_hits);
} else {
ZFETCHSTAT_BUMP(zfetchstat_misses);
if (fetched = dmu_zfetch_colinear(zf, &zst)) {
ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
} else {
ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
}
}
if (!fetched) {
@ -634,11 +705,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
* we still couldn't find a stream, drop the lock, and allocate
* one if possible. Otherwise, give up and go home.
*/
if (newstream == NULL) {
if (newstream) {
ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
} else {
uint64_t maxblocks;
uint32_t max_streams;
uint32_t cur_streams;
ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
cur_streams = zf->zf_stream_cnt;
maxblocks = zf->zf_dnode->dn_maxblkid;
@ -651,7 +725,6 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
if (cur_streams >= max_streams) {
return;
}
newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
}

View File

@ -2564,11 +2564,12 @@ spa_tryimport(nvlist_t *tryconfig)
* The act of destroying or exporting a pool is very simple. We make sure there
* is no more pending I/O and any references to the pool are gone. Then, we
* update the pool state and sync all the labels to disk, removing the
* configuration from the cache afterwards.
* configuration from the cache afterwards. If the 'hardforce' flag is set, then
* we don't sync the labels or remove the configuration cache.
*/
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force)
boolean_t force, boolean_t hardforce)
{
spa_t *spa;
@ -2636,7 +2637,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
* so mark them all dirty. spa_unload() will do the
* final sync that pushes these changes out.
*/
if (new_state != POOL_STATE_UNINITIALIZED) {
if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa->spa_state = new_state;
spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
@ -2656,7 +2657,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
if (new_state != POOL_STATE_UNINITIALIZED) {
spa_config_sync(spa, B_TRUE, B_TRUE);
if (!hardforce)
spa_config_sync(spa, B_TRUE, B_TRUE);
spa_remove(spa);
}
mutex_exit(&spa_namespace_lock);
@ -2670,16 +2672,19 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
int
spa_destroy(char *pool)
{
return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE));
return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
B_FALSE, B_FALSE));
}
/*
* Export a storage pool.
*/
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force)
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce)
{
return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force));
return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
force, hardforce));
}
/*
@ -2690,7 +2695,7 @@ int
spa_reset(char *pool)
{
return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
B_FALSE));
B_FALSE, B_FALSE));
}
/*

View File

@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _DFETCH_H
#define _DFETCH_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
#ifdef __cplusplus
@ -63,6 +61,9 @@ typedef struct zfetch {
uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
} zfetch_t;
void zfetch_init(void);
void zfetch_fini(void);
void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_rele(zfetch_t *);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);

View File

@ -333,7 +333,8 @@ extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce);
extern int spa_reset(char *pool);
extern void spa_async_request(spa_t *spa, int flag);
extern void spa_async_unrequest(spa_t *spa, int flag);

View File

@ -44,6 +44,7 @@ extern "C" {
#define ZRENAMING 0x0010 /* znode is being renamed */
#define ZCILOOK 0x0020 /* case-insensitive lookup requested */
#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */
#define ZHAVELOCK 0x0080 /* z_name_lock is already held */
/* mknode flags */
#define IS_ROOT_NODE 0x01 /* create a root node */

View File

@ -174,6 +174,7 @@ typedef struct znode_phys {
typedef struct zfs_dirlock {
char *dl_name; /* directory entry being locked */
uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */
uint16_t dl_namesize; /* set if dl_name was allocated */
kcondvar_t dl_cv; /* wait for entry to be unlocked */
struct znode *dl_dzp; /* directory znode */

View File

@ -114,6 +114,8 @@ zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
* ZCIEXACT: On a purely case-insensitive file system,
* this lookup should be case-sensitive.
* ZRENAMING: we are locking for renaming, force narrow locks
* ZHAVELOCK: Don't grab the z_name_lock for this call. The
* current thread already holds it.
*
* Output arguments:
* zpp - pointer to the znode for the entry (NULL if there isn't one)
@ -208,13 +210,20 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
/*
* Wait until there are no locks on this name.
*
* Don't grab the lock if it is already held. However, cannot
* have both ZSHARED and ZHAVELOCK together.
*/
rw_enter(&dzp->z_name_lock, RW_READER);
ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
if (!(flag & ZHAVELOCK))
rw_enter(&dzp->z_name_lock, RW_READER);
mutex_enter(&dzp->z_lock);
for (;;) {
if (dzp->z_unlinked) {
mutex_exit(&dzp->z_lock);
rw_exit(&dzp->z_name_lock);
if (!(flag & ZHAVELOCK))
rw_exit(&dzp->z_name_lock);
return (ENOENT);
}
for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
@ -224,7 +233,8 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
}
if (error != 0) {
mutex_exit(&dzp->z_lock);
rw_exit(&dzp->z_name_lock);
if (!(flag & ZHAVELOCK))
rw_exit(&dzp->z_name_lock);
return (ENOENT);
}
if (dl == NULL) {
@ -235,6 +245,7 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
dl->dl_name = name;
dl->dl_sharecnt = 0;
dl->dl_namelock = 0;
dl->dl_namesize = 0;
dl->dl_dzp = dzp;
dl->dl_next = dzp->z_dirlocks;
@ -246,6 +257,12 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
cv_wait(&dl->dl_cv, &dzp->z_lock);
}
/*
* If the z_name_lock was NOT held for this dirlock, record it.
*/
if (flag & ZHAVELOCK)
dl->dl_namelock = 1;
if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
/*
* We're the second shared reference to dl. Make a copy of
@ -325,7 +342,10 @@ zfs_dirent_unlock(zfs_dirlock_t *dl)
zfs_dirlock_t **prev_dl, *cur_dl;
mutex_enter(&dzp->z_lock);
rw_exit(&dzp->z_name_lock);
if (!dl->dl_namelock)
rw_exit(&dzp->z_name_lock);
if (dl->dl_sharecnt > 1) {
dl->dl_sharecnt--;
mutex_exit(&dzp->z_lock);

View File

@ -882,9 +882,10 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
{
int error;
boolean_t force = (boolean_t)zc->zc_cookie;
boolean_t hardforce = (boolean_t)zc->zc_guid;
zfs_log_history(zc);
error = spa_export(zc->zc_name, NULL, force);
error = spa_export(zc->zc_name, NULL, force, hardforce);
return (error);
}

View File

@ -3208,6 +3208,15 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
}
}
/*
* If the source and destination directories are the same, we should
* grab the z_name_lock of that directory only once.
*/
if (sdzp == tdzp) {
zflg |= ZHAVELOCK;
rw_enter(&sdzp->z_name_lock, RW_READER);
}
if (cmp < 0) {
serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
ZEXISTS | zflg, NULL, NULL);
@ -3230,6 +3239,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
if (tzp)
VN_RELE(ZTOV(tzp));
}
if (sdzp == tdzp)
rw_exit(&sdzp->z_name_lock);
if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
serr = EINVAL;
ZFS_EXIT(zfsvfs);
@ -3238,6 +3251,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
if (terr) {
zfs_dirent_unlock(sdl);
VN_RELE(ZTOV(szp));
if (sdzp == tdzp)
rw_exit(&sdzp->z_name_lock);
if (strcmp(tnm, "..") == 0)
terr = EINVAL;
ZFS_EXIT(zfsvfs);
@ -3320,6 +3337,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
zfs_rename_unlock(&zl);
zfs_dirent_unlock(sdl);
zfs_dirent_unlock(tdl);
if (sdzp == tdzp)
rw_exit(&sdzp->z_name_lock);
VN_RELE(ZTOV(szp));
if (tzp)
VN_RELE(ZTOV(tzp));
@ -3367,6 +3388,9 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
zfs_dirent_unlock(sdl);
zfs_dirent_unlock(tdl);
if (sdzp == tdzp)
rw_exit(&sdzp->z_name_lock);
VN_RELE(ZTOV(szp));
if (tzp)
VN_RELE(ZTOV(tzp));