From 73aeb3d88bbbf8bd3838b440655e31a2677b806c Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Thu, 8 Nov 2012 01:24:06 +0000 Subject: [PATCH] Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13849:3468a95b27cd (illumos ZFS issues #3145, #3212, #3258) --- cmd/ztest/ztest.c | 99 ++++++++++++++++++++++++------------- uts/common/fs/zfs/arc.c | 86 +++++++++++++++++++++++++++++++- uts/common/fs/zfs/dbuf.c | 19 ++++++- uts/common/fs/zfs/sys/arc.h | 1 + 4 files changed, 169 insertions(+), 36 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 95b8e2ed790a..3da6723459ab 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -119,8 +119,8 @@ #include #include -#define ZTEST_FD_DATA 3 -#define ZTEST_FD_RAND 4 +static int ztest_fd_data = -1; +static int ztest_fd_rand = -1; typedef struct ztest_shared_hdr { uint64_t zh_hdr_size; @@ -708,14 +708,17 @@ process_options(int argc, char **argv) UINT64_MAX >> 2); if (strlen(altdir) > 0) { - char cmd[MAXNAMELEN]; - char realaltdir[MAXNAMELEN]; + char *cmd; + char *realaltdir; char *bin; char *ztest; char *isa; int isalen; - (void) realpath(getexecname(), cmd); + cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + VERIFY(NULL != realpath(getexecname(), cmd)); if (0 != access(altdir, F_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest path: %s", @@ -746,6 +749,9 @@ process_options(int argc, char **argv) fatal(B_TRUE, "invalid alternate lib directory %s", zo->zo_alt_libpath); } + + umem_free(cmd, MAXPATHLEN); + umem_free(realaltdir, MAXPATHLEN); } } @@ -762,10 +768,12 @@ ztest_random(uint64_t range) { uint64_t r; + ASSERT3S(ztest_fd_rand, >=, 0); + if (range == 0) return (0); - if (read(ZTEST_FD_RAND, &r, sizeof (r)) != sizeof (r)) + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) fatal(1, "short read from /dev/urandom"); return (r % range); @@ -4695,7 +4703,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) if (islog) (void) rw_unlock(&ztest_name_lock); } else { + /* + * Ideally we would like to be able to randomly + * call vdev_[on|off]line without holding locks + * to force unpredictable failures but the side + * effects of vdev_[on|off]line prevent us from + * doing so. We grab the ztest_vdev_lock here to + * prevent a race between injection testing and + * aux_vdev removal. 
+ */ + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); (void) vdev_online(spa, guid0, 0, NULL); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } } @@ -5650,20 +5669,16 @@ ztest_init(ztest_shared_t *zs) } static void -setup_fds(void) +setup_data_fd(void) { - int fd; + static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; - char *tmp = tempnam(NULL, NULL); - fd = open(tmp, O_RDWR | O_CREAT, 0700); - ASSERT3U(fd, ==, ZTEST_FD_DATA); - (void) unlink(tmp); - free(tmp); - - fd = open("/dev/urandom", O_RDONLY); - ASSERT3U(fd, ==, ZTEST_FD_RAND); + ztest_fd_data = mkstemp(ztest_name_data); + ASSERT3S(ztest_fd_data, >=, 0); + (void) unlink(ztest_name_data); } + static int shared_data_size(ztest_shared_hdr_t *hdr) { @@ -5685,10 +5700,10 @@ setup_hdr(void) ztest_shared_hdr_t *hdr; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), - PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0); + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); - VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, sizeof (ztest_shared_hdr_t))); + VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); hdr->zh_opts_size = sizeof (ztest_shared_opts_t); @@ -5699,7 +5714,7 @@ setup_hdr(void) hdr->zh_ds_count = ztest_opts.zo_datasets; size = shared_data_size(hdr); - VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, size)); + VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); } @@ -5712,14 +5727,14 @@ setup_data(void) uint8_t *buf; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), - PROT_READ, MAP_SHARED, ZTEST_FD_DATA, 0); + PROT_READ, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); size = shared_data_size(hdr); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), - PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0); + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT(hdr != MAP_FAILED); buf = (uint8_t *)hdr; @@ -5738,12 +5753,13 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) { pid_t pid; int status; - char cmdbuf[MAXPATHLEN]; + char *cmdbuf = NULL; pid = fork(); if (cmd == NULL) { - (void) strlcpy(cmdbuf, getexecname(), sizeof (cmdbuf)); + cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); cmd = cmdbuf; } @@ -5752,9 +5768,16 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) if (pid == 0) { /* child */ char *emptyargv[2] = { cmd, NULL }; + char fd_data_str[12]; struct rlimit rl = { 1024, 1024 }; (void) setrlimit(RLIMIT_NOFILE, &rl); + + (void) close(ztest_fd_rand); + VERIFY3U(11, >=, + snprintf(fd_data_str, 12, "%d", ztest_fd_data)); + VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); + (void) enable_extended_FILE_stdio(-1, -1); if (libpath != NULL) VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); @@ -5763,6 +5786,11 @@ exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) fatal(B_TRUE, "exec failed: %s", cmd); } + if (cmdbuf != NULL) { + umem_free(cmdbuf, MAXPATHLEN); + cmd = NULL; + } + while (waitpid(pid, &status, 0) != pid) continue; if (statusp != NULL) @@ -5827,39 +5855,41 @@ main(int argc, char **argv) char timebuf[100]; char numbuf[6]; spa_t *spa; - char cmd[MAXNAMELEN]; + char *cmd; boolean_t hasalt; - - boolean_t ischild = (0 == lseek(ZTEST_FD_DATA, 0, SEEK_CUR)); - ASSERT(ischild || errno == EBADF); + char 
*fd_data_str = getenv("ZTEST_FD_DATA"); (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); - if (!ischild) { + ztest_fd_rand = open("/dev/urandom", O_RDONLY); + ASSERT3S(ztest_fd_rand, >=, 0); + + if (!fd_data_str) { process_options(argc, argv); - setup_fds(); + setup_data_fd(); setup_hdr(); setup_data(); bcopy(&ztest_opts, ztest_shared_opts, sizeof (*ztest_shared_opts)); } else { + ztest_fd_data = atoi(fd_data_str); setup_data(); bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); } ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); /* Override location of zpool.cache */ - (void) asprintf((char **)&spa_config_path, "%s/zpool.cache", - ztest_opts.zo_dir); + VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache", + ztest_opts.zo_dir), !=, -1); ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), UMEM_NOFAIL); zs = ztest_shared; - if (ischild) { + if (fd_data_str) { metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang; metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; @@ -5882,7 +5912,8 @@ main(int argc, char **argv) (u_longlong_t)ztest_opts.zo_time); } - (void) strlcpy(cmd, getexecname(), sizeof (cmd)); + cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); + (void) strlcpy(cmd, getexecname(), MAXNAMELEN); zs->zs_do_init = B_TRUE; if (strlen(ztest_opts.zo_alt_ztest) != 0) { @@ -6023,5 +6054,7 @@ main(int argc, char **argv) kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } + umem_free(cmd, MAXNAMELEN); + return (0); } diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index e6a2df18e98d..5b0e464ac586 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -188,6 +188,7 @@ uint64_t zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_disable_dup_eviction = 0; /* * Note that buffers can be in one of 6 states: @@ -290,6 +291,9 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; } arc_stats_t; static arc_stats_t arc_stats = { @@ -345,7 +349,10 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -1360,6 +1367,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. 
+ */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1458,6 +1476,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1641,6 +1669,48 @@ arc_buf_size(arc_buf_t *buf) return (buf->b_hdr->b_size); } +/* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. + */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + /* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. @@ -2626,8 +2696,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -3166,6 +3238,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c index a6048bb51178..ac9e3b28f66b 100644 --- a/uts/common/fs/zfs/dbuf.c +++ b/uts/common/fs/zfs/dbuf.c @@ -2089,7 +2089,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - if (!DBUF_IS_CACHEABLE(db)) + + /* + * A dbuf will be eligible for eviction if either the + * 'primarycache' property is set or a duplicate + * copy of this buffer is already cached in the arc. + * + * In the case of the 'primarycache' a buffer + * is considered for eviction if it matches the + * criteria set in the property. 
+ * + * To decide if our buffer is considered a + * duplicate, we must call into the arc to determine + * if multiple buffers are referencing the same + * block on-disk. If so, then we simply evict + * ourselves. + */ + if (!DBUF_IS_CACHEABLE(db) || + arc_buf_eviction_needed(db->db_buf)) dbuf_clear(db); else mutex_exit(&db->db_mtx); diff --git a/uts/common/fs/zfs/sys/arc.h b/uts/common/fs/zfs/sys/arc.h index 28dbc57275cb..b109dcafbcc7 100644 --- a/uts/common/fs/zfs/sys/arc.h +++ b/uts/common/fs/zfs/sys/arc.h @@ -99,6 +99,7 @@ int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); +boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif
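
The interesting mechanics in the ztest.c hunks above are that the previously hard-coded descriptor numbers (ZTEST_FD_DATA 3, ZTEST_FD_RAND 4) become ordinary descriptors opened at run time, with the data descriptor's number handed to the re-exec'ed child through the ZTEST_FD_DATA environment variable and the mapping re-established on the child side. The stand-alone sketch below illustrates the same pattern outside of ztest; it is not part of the patch, and SKETCH_FD_DATA, the temp-file name and the single long of shared state are invented for the example.

/*
 * Sketch only: handing an open descriptor to a re-exec'ed child through
 * an environment variable, as ztest now does with ZTEST_FD_DATA.
 * SKETCH_FD_DATA, the temp-file name and the single long of shared
 * state are invented for this example.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	char *fd_str = getenv("SKETCH_FD_DATA");
	long *shared;
	int fd;

	(void) argc;

	if (fd_str != NULL) {
		/* Child after exec: recover the inherited descriptor. */
		fd = atoi(fd_str);
		shared = mmap(NULL, sizeof (*shared),
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (shared == MAP_FAILED)
			return (1);
		printf("child inherited fd %d, sees %ld\n", fd, *shared);
		*shared = 42;
		return (0);
	}

	/* Parent: nameless backing file, as in setup_data_fd(). */
	char template[] = "/tmp/sketch.data.XXXXXX";
	if ((fd = mkstemp(template)) < 0)
		return (1);
	(void) unlink(template);	/* keep only the descriptor */
	if (ftruncate(fd, sizeof (*shared)) != 0)
		return (1);

	shared = mmap(NULL, sizeof (*shared), PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (shared == MAP_FAILED)
		return (1);
	*shared = 7;

	if (fork() == 0) {
		char buf[12];

		/* Only the number crosses exec; the descriptor itself
		 * is inherited because it is not close-on-exec. */
		(void) snprintf(buf, sizeof (buf), "%d", fd);
		(void) setenv("SKETCH_FD_DATA", buf, 1);
		(void) execv(argv[0], argv);
		_exit(127);
	}
	(void) wait(NULL);
	printf("parent now sees %ld\n", *shared);
	return (0);
}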
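
setup_hdr() and setup_data() size the shared region in two passes: the fixed-size header is mapped first, the total size is computed from the counts it records (shared_data_size()), and the file is then grown and remapped at full size. A minimal sketch of that idea follows, again not taken from the patch; shared_hdr_t and its fields are invented stand-ins for ztest_shared_hdr_t.

/*
 * Sketch only: the map-the-header-then-remap pattern used by
 * setup_hdr()/setup_data().  shared_hdr_t and its fields are invented
 * stand-ins for ztest_shared_hdr_t.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

typedef struct shared_hdr {
	size_t	hdr_size;	/* size of this header */
	size_t	record_size;	/* size of one record that follows it */
	size_t	record_count;	/* number of records that follow it */
} shared_hdr_t;

static size_t
shared_data_size(const shared_hdr_t *hdr)
{
	return (hdr->hdr_size + hdr->record_size * hdr->record_count);
}

int
main(void)
{
	char template[] = "/tmp/sketch.hdr.XXXXXX";
	int fd = mkstemp(template);
	shared_hdr_t *hdr;
	size_t size;

	if (fd < 0)
		return (1);
	(void) unlink(template);

	/* Pass 1: room for the header only; fill in the layout. */
	if (ftruncate(fd, sizeof (shared_hdr_t)) != 0)
		return (1);
	hdr = mmap(NULL, sizeof (shared_hdr_t), PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (hdr == MAP_FAILED)
		return (1);
	hdr->hdr_size = sizeof (shared_hdr_t);
	hdr->record_size = 64;
	hdr->record_count = 16;

	/* Pass 2: grow the file to the size the header describes and
	 * remap the whole region. */
	size = shared_data_size(hdr);
	(void) munmap(hdr, sizeof (shared_hdr_t));
	if (ftruncate(fd, size) != 0)
		return (1);
	hdr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (hdr == MAP_FAILED)
		return (1);

	memset((char *)hdr + hdr->hdr_size, 0, size - hdr->hdr_size);
	printf("mapped %zu bytes of shared state\n", size);
	return (0);
}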
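
On the arc.c side, the patch adds three counters (duplicate_buffers, duplicate_buffers_size, duplicate_reads) that track in-core copies of the same block beyond the first: arc_buf_clone() bumps them, and arc_buf_destroy()/arc_release() unwind them while more than one copy remains, so they fall back to zero as duplicates disappear. On illumos they should show up with the other ARC statistics in the arcstats kstat (kstat -m zfs -n arcstats). The toy model below uses simplified types rather than the real ARC structures, purely to show that accounting discipline.

/*
 * Sketch only: a toy model of the duplicate-buffer accounting added by
 * this patch.  hdr_t and the counter names are simplified stand-ins for
 * arc_buf_hdr_t and the arcstat_duplicate_* statistics.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

static unsigned long duplicate_buffers;		/* ~ arcstat_duplicate_buffers */
static unsigned long duplicate_buffers_size;	/* ~ arcstat_duplicate_buffers_size */

typedef struct hdr {
	size_t	size;		/* payload size shared by every copy */
	int	datacnt;	/* number of in-core copies */
	int	is_data;	/* only user data (ARC_BUFC_DATA) is tracked */
} hdr_t;

/* Modeled on arc_buf_clone(): every copy past the first is a duplicate. */
static void
clone_buf(hdr_t *h)
{
	if (h->is_data) {
		duplicate_buffers++;
		duplicate_buffers_size += h->size;
	}
	h->datacnt++;
}

/* Modeled on arc_buf_destroy()/arc_release(): unwind the accounting as
 * long as more than one copy still exists. */
static void
destroy_buf(hdr_t *h)
{
	assert(h->datacnt > 0);
	if (h->datacnt > 1 && h->is_data) {
		duplicate_buffers--;
		duplicate_buffers_size -= h->size;
	}
	h->datacnt--;
}

int
main(void)
{
	hdr_t h = { 131072, 1, 1 };	/* one 128K user-data buffer */

	clone_buf(&h);		/* 1 duplicate, 128K tracked */
	clone_buf(&h);		/* 2 duplicates, 256K tracked */
	destroy_buf(&h);	/* back to 1 duplicate */
	destroy_buf(&h);	/* back to none */

	printf("duplicates=%lu bytes=%lu datacnt=%d\n",
	    duplicate_buffers, duplicate_buffers_size, h.datacnt);
	return (0);
}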