14b5719f6a
due to zl_itx_list_sz not being updated when async itxs are upgraded to sync. Actually, because of other changes made around that time, zl_itx_list_sz is not really required to implement the functionality, so this patch removes some unneeded broken code and variables. The original idea of zil_slog_limit was to reduce the chance of SLOG abuse by a single heavy logger, which increased latency for other (more latency-critical) loggers, by pushing the heavy log out into the main pool instead of the SLOG. Besides a huge latency increase for heavy writers, this implementation caused a double write of all data, since the log records were explicitly prepared for the SLOG. Since we now have an I/O scheduler, I've found it can be much more efficient to reduce the priority of heavy-logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE to ZIO_PRIORITY_ASYNC_WRITE, while still leaving them on the SLOG. The existing ZIL implementation had a problem with space efficiency when it had to write large chunks of data into log blocks of limited size. In some cases efficiency dropped to almost as low as 50%. In the case of a ZIL stored on spinning rust, that also cut log write speed in half, since the head had to uselessly fly over allocated but unwritten areas. This change improves the situation by offloading problematic operations from z*_log_write() to zil_lwb_commit(), which knows the real state of log block allocation and can split large requests into pieces much more efficiently. Also, as a side effect, it removes one of the two data copy operations done by the ZIL code in the WR_COPIED case. While there, untangle and unify the code of the z*_log_write() functions. Also, zfs_log_write(), like zvol_log_write(), can now handle writes crossing a block boundary, which may also improve efficiency if ZPL is made to do that. Sponsored by: iXsystems, Inc.
3225 lines
76 KiB
C
3225 lines
76 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
/*
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
|
*
|
|
* Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
|
|
* All rights reserved.
|
|
*
|
|
* Portions Copyright 2010 Robert Milkowski
|
|
*
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
|
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
|
|
* Copyright (c) 2014 Integros [integros.com]
|
|
*/
|
|
|
|
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
|
|
|
|
/*
|
|
* ZFS volume emulation driver.
|
|
*
|
|
* Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
|
|
* Volumes are accessed through the symbolic links named:
|
|
*
|
|
* /dev/zvol/dsk/<pool_name>/<dataset_name>
|
|
* /dev/zvol/rdsk/<pool_name>/<dataset_name>
|
|
*
|
|
* These links are created by the /dev filesystem (sdev_zvolops.c).
|
|
* Volumes are persistent through reboot. No user command needs to be
|
|
* run before opening and using a device.
|
|
*
|
|
* FreeBSD notes.
|
|
* On FreeBSD ZVOLs are simply GEOM providers like any other storage device
|
|
* in the system.
|
|
*/
|
|
|
|
#include <sys/types.h>
|
|
#include <sys/param.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/errno.h>
|
|
#include <sys/uio.h>
|
|
#include <sys/bio.h>
|
|
#include <sys/buf.h>
|
|
#include <sys/kmem.h>
|
|
#include <sys/conf.h>
|
|
#include <sys/cmn_err.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/zap.h>
|
|
#include <sys/spa.h>
|
|
#include <sys/spa_impl.h>
|
|
#include <sys/zio.h>
|
|
#include <sys/disk.h>
|
|
#include <sys/dmu_traverse.h>
|
|
#include <sys/dnode.h>
|
|
#include <sys/dsl_dataset.h>
|
|
#include <sys/dsl_prop.h>
|
|
#include <sys/dkio.h>
|
|
#include <sys/byteorder.h>
|
|
#include <sys/sunddi.h>
|
|
#include <sys/dirent.h>
|
|
#include <sys/policy.h>
|
|
#include <sys/queue.h>
|
|
#include <sys/fs/zfs.h>
|
|
#include <sys/zfs_ioctl.h>
|
|
#include <sys/zil.h>
|
|
#include <sys/refcount.h>
|
|
#include <sys/zfs_znode.h>
|
|
#include <sys/zfs_rlock.h>
|
|
#include <sys/vdev_impl.h>
|
|
#include <sys/vdev_raidz.h>
|
|
#include <sys/zvol.h>
|
|
#include <sys/zil_impl.h>
|
|
#include <sys/dbuf.h>
|
|
#include <sys/dmu_tx.h>
|
|
#include <sys/zfeature.h>
|
|
#include <sys/zio_checksum.h>
|
|
#include <sys/filio.h>
|
|
|
|
#include <geom/geom.h>
|
|
|
|
#include "zfs_namecheck.h"
|
|
|
|
#ifndef illumos
|
|
struct g_class zfs_zvol_class = {
|
|
.name = "ZFS::ZVOL",
|
|
.version = G_VERSION,
|
|
};
|
|
|
|
DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
|
|
|
|
#endif
|
|
void *zfsdev_state;
|
|
static char *zvol_tag = "zvol_tag";
|
|
|
|
#define ZVOL_DUMPSIZE "dumpsize"
|
|
|
|
/*
|
|
* This lock protects the zfsdev_state structure from being modified
|
|
* while it's being used, e.g. an open that comes in before a create
|
|
* finishes. It also protects temporary opens of the dataset so that,
|
|
* e.g., an open doesn't get a spurious EBUSY.
|
|
*/
|
|
#ifdef illumos
|
|
kmutex_t zfsdev_state_lock;
|
|
#else
|
|
/*
|
|
* In FreeBSD we've replaced the upstream zfsdev_state_lock with the
|
|
* spa_namespace_lock in the ZVOL code.
|
|
*/
|
|
#define zfsdev_state_lock spa_namespace_lock
|
|
#endif
|
|
static uint32_t zvol_minors;
|
|
|
|
#ifndef illumos
|
|
SYSCTL_DECL(_vfs_zfs);
|
|
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
|
|
static int volmode = ZFS_VOLMODE_GEOM;
|
|
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
|
|
"Expose as GEOM providers (1), device files (2) or neither");
|
|
static boolean_t zpool_on_zvol = B_FALSE;
|
|
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
|
|
"Allow zpools to use zvols as vdevs (DANGEROUS)");
|
|
|
|
#endif
|
|
typedef struct zvol_extent {
|
|
list_node_t ze_node;
|
|
dva_t ze_dva; /* dva associated with this extent */
|
|
uint64_t ze_nblks; /* number of blocks in extent */
|
|
} zvol_extent_t;
|
|
|
|
/*
|
|
* The in-core state of each volume.
|
|
*/
|
|
typedef struct zvol_state {
|
|
#ifndef illumos
|
|
LIST_ENTRY(zvol_state) zv_links;
|
|
#endif
|
|
char zv_name[MAXPATHLEN]; /* pool/dd name */
|
|
uint64_t zv_volsize; /* amount of space we advertise */
|
|
uint64_t zv_volblocksize; /* volume block size */
|
|
#ifdef illumos
|
|
minor_t zv_minor; /* minor number */
|
|
#else
|
|
struct cdev *zv_dev; /* non-GEOM device */
|
|
struct g_provider *zv_provider; /* GEOM provider */
|
|
#endif
|
|
uint8_t zv_min_bs; /* minimum addressable block shift */
|
|
uint8_t zv_flags; /* readonly, dumpified, etc. */
|
|
objset_t *zv_objset; /* objset handle */
|
|
#ifdef illumos
|
|
uint32_t zv_open_count[OTYPCNT]; /* open counts */
|
|
#endif
|
|
uint32_t zv_total_opens; /* total open count */
|
|
uint32_t zv_sync_cnt; /* synchronous open count */
|
|
zilog_t *zv_zilog; /* ZIL handle */
|
|
list_t zv_extents; /* List of extents for dump */
|
|
znode_t zv_znode; /* for range locking */
|
|
dmu_buf_t *zv_dbuf; /* bonus handle */
|
|
#ifndef illumos
|
|
int zv_state;
|
|
int zv_volmode; /* Provide GEOM or cdev */
|
|
struct bio_queue_head zv_queue;
|
|
struct mtx zv_queue_mtx; /* zv_queue mutex */
|
|
#endif
|
|
} zvol_state_t;
|
|
|
|
#ifndef illumos
|
|
static LIST_HEAD(, zvol_state) all_zvols;
|
|
#endif
|
|
/*
|
|
* zvol specific flags
|
|
*/
|
|
#define ZVOL_RDONLY 0x1
|
|
#define ZVOL_DUMPIFIED 0x2
|
|
#define ZVOL_EXCL 0x4
|
|
#define ZVOL_WCE 0x8
|
|
|
|
/*
|
|
* zvol maximum transfer in one DMU tx.
|
|
*/
|
|
int zvol_maxphys = DMU_MAX_ACCESS/2;
|
|
|
|
/*
|
|
* Toggle unmap functionality.
|
|
*/
|
|
boolean_t zvol_unmap_enabled = B_TRUE;
|
|
#ifndef illumos
|
|
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
|
|
&zvol_unmap_enabled, 0,
|
|
"Enable UNMAP functionality");
|
|
|
|
static d_open_t zvol_d_open;
|
|
static d_close_t zvol_d_close;
|
|
static d_read_t zvol_read;
|
|
static d_write_t zvol_write;
|
|
static d_ioctl_t zvol_d_ioctl;
|
|
static d_strategy_t zvol_strategy;
|
|
|
|
static struct cdevsw zvol_cdevsw = {
|
|
.d_version = D_VERSION,
|
|
.d_open = zvol_d_open,
|
|
.d_close = zvol_d_close,
|
|
.d_read = zvol_read,
|
|
.d_write = zvol_write,
|
|
.d_ioctl = zvol_d_ioctl,
|
|
.d_strategy = zvol_strategy,
|
|
.d_name = "zvol",
|
|
.d_flags = D_DISK | D_TRACKCLOSE,
|
|
};
|
|
|
|
static void zvol_geom_run(zvol_state_t *zv);
|
|
static void zvol_geom_destroy(zvol_state_t *zv);
|
|
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
|
|
static void zvol_geom_start(struct bio *bp);
|
|
static void zvol_geom_worker(void *arg);
|
|
static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
|
|
uint64_t len, boolean_t sync);
|
|
#endif /* !illumos */
|
|
|
|
extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
|
|
nvlist_t *, nvlist_t *);
|
|
static int zvol_remove_zv(zvol_state_t *);
|
|
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
|
|
static int zvol_dumpify(zvol_state_t *zv);
|
|
static int zvol_dump_fini(zvol_state_t *zv);
|
|
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
|
|
|
|
static void
|
|
zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
|
|
{
|
|
#ifdef illumos
|
|
dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
|
|
|
|
zv->zv_volsize = volsize;
|
|
VERIFY(ddi_prop_update_int64(dev, zfs_dip,
|
|
"Size", volsize) == DDI_SUCCESS);
|
|
VERIFY(ddi_prop_update_int64(dev, zfs_dip,
|
|
"Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
|
|
|
|
/* Notify specfs to invalidate the cached size */
|
|
spec_size_invalidate(dev, VBLK);
|
|
spec_size_invalidate(dev, VCHR);
|
|
#else /* !illumos */
|
|
zv->zv_volsize = volsize;
|
|
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
|
|
struct g_provider *pp;
|
|
|
|
pp = zv->zv_provider;
|
|
if (pp == NULL)
|
|
return;
|
|
g_topology_lock();
|
|
g_resize_provider(pp, zv->zv_volsize);
|
|
g_topology_unlock();
|
|
}
|
|
#endif /* illumos */
|
|
}
|
|
|
|
int
|
|
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
|
|
{
|
|
if (volsize == 0)
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
if (volsize % blocksize != 0)
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
#ifdef _ILP32
|
|
if (volsize - 1 > SPEC_MAXOFFSET_T)
|
|
return (SET_ERROR(EOVERFLOW));
|
|
#endif
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
zvol_check_volblocksize(uint64_t volblocksize)
|
|
{
|
|
if (volblocksize < SPA_MINBLOCKSIZE ||
|
|
volblocksize > SPA_OLD_MAXBLOCKSIZE ||
|
|
!ISP2(volblocksize))
|
|
return (SET_ERROR(EDOM));
|
|
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
zvol_get_stats(objset_t *os, nvlist_t *nv)
|
|
{
|
|
int error;
|
|
dmu_object_info_t doi;
|
|
uint64_t val;
|
|
|
|
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
|
|
if (error)
|
|
return (error);
|
|
|
|
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
|
|
|
|
error = dmu_object_info(os, ZVOL_OBJ, &doi);
|
|
|
|
if (error == 0) {
|
|
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
|
|
doi.doi_data_block_size);
|
|
}
|
|
|
|
return (error);
|
|
}
|
|
|
|
static zvol_state_t *
|
|
zvol_minor_lookup(const char *name)
|
|
{
|
|
#ifdef illumos
|
|
minor_t minor;
|
|
#endif
|
|
zvol_state_t *zv;
|
|
|
|
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
|
|
|
#ifdef illumos
|
|
for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
|
|
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
|
|
if (zv == NULL)
|
|
continue;
|
|
#else
|
|
LIST_FOREACH(zv, &all_zvols, zv_links) {
|
|
#endif
|
|
if (strcmp(zv->zv_name, name) == 0)
|
|
return (zv);
|
|
}
|
|
|
|
return (NULL);
|
|
}
|
|
|
|
/* extent mapping arg */
|
|
struct maparg {
|
|
zvol_state_t *ma_zv;
|
|
uint64_t ma_blks;
|
|
};
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
|
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
|
|
{
|
|
struct maparg *ma = arg;
|
|
zvol_extent_t *ze;
|
|
int bs = ma->ma_zv->zv_volblocksize;
|
|
|
|
if (bp == NULL || BP_IS_HOLE(bp) ||
|
|
zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
|
|
return (0);
|
|
|
|
VERIFY(!BP_IS_EMBEDDED(bp));
|
|
|
|
VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
|
|
ma->ma_blks++;
|
|
|
|
/* Abort immediately if we have encountered gang blocks */
|
|
if (BP_IS_GANG(bp))
|
|
return (SET_ERROR(EFRAGS));
|
|
|
|
/*
|
|
* See if the block is at the end of the previous extent.
|
|
*/
|
|
ze = list_tail(&ma->ma_zv->zv_extents);
|
|
if (ze &&
|
|
DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
|
|
DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
|
|
DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
|
|
ze->ze_nblks++;
|
|
return (0);
|
|
}
|
|
|
|
dprintf_bp(bp, "%s", "next blkptr:");
|
|
|
|
/* start a new extent */
|
|
ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
|
|
ze->ze_dva = bp->blk_dva[0]; /* structure assignment */
|
|
ze->ze_nblks = 1;
|
|
list_insert_tail(&ma->ma_zv->zv_extents, ze);
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
zvol_free_extents(zvol_state_t *zv)
|
|
{
|
|
zvol_extent_t *ze;
|
|
|
|
while (ze = list_head(&zv->zv_extents)) {
|
|
list_remove(&zv->zv_extents, ze);
|
|
kmem_free(ze, sizeof (zvol_extent_t));
|
|
}
|
|
}
|
|
|
|
static int
|
|
zvol_get_lbas(zvol_state_t *zv)
|
|
{
|
|
objset_t *os = zv->zv_objset;
|
|
struct maparg ma;
|
|
int err;
|
|
|
|
ma.ma_zv = zv;
|
|
ma.ma_blks = 0;
|
|
zvol_free_extents(zv);
|
|
|
|
/* commit any in-flight changes before traversing the dataset */
|
|
txg_wait_synced(dmu_objset_pool(os), 0);
|
|
err = traverse_dataset(dmu_objset_ds(os), 0,
|
|
TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
|
|
if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
|
|
zvol_free_extents(zv);
|
|
return (err ? err : EIO);
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
void
|
|
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
|
|
{
|
|
zfs_creat_t *zct = arg;
|
|
nvlist_t *nvprops = zct->zct_props;
|
|
int error;
|
|
uint64_t volblocksize, volsize;
|
|
|
|
VERIFY(nvlist_lookup_uint64(nvprops,
|
|
zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
|
|
if (nvlist_lookup_uint64(nvprops,
|
|
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
|
|
volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
|
|
|
|
/*
|
|
* These properties must be removed from the list so the generic
|
|
* property setting step won't apply to them.
|
|
*/
|
|
VERIFY(nvlist_remove_all(nvprops,
|
|
zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
|
|
(void) nvlist_remove_all(nvprops,
|
|
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
|
|
|
|
error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
|
|
DMU_OT_NONE, 0, tx);
|
|
ASSERT(error == 0);
|
|
|
|
error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
|
|
DMU_OT_NONE, 0, tx);
|
|
ASSERT(error == 0);
|
|
|
|
error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
|
|
ASSERT(error == 0);
|
|
}
|
|
|
|
/*
|
|
* Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
|
|
* implement DKIOCFREE/free-long-range.
|
|
*/
|
|
static int
|
|
zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
|
|
{
|
|
uint64_t offset, length;
|
|
|
|
if (byteswap)
|
|
byteswap_uint64_array(lr, sizeof (*lr));
|
|
|
|
offset = lr->lr_offset;
|
|
length = lr->lr_length;
|
|
|
|
return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
|
|
}
|
|
|
|
/*
|
|
* Replay a TX_WRITE ZIL transaction that didn't get committed
|
|
* after a system failure
|
|
*/
|
|
static int
|
|
zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
|
|
{
|
|
objset_t *os = zv->zv_objset;
|
|
char *data = (char *)(lr + 1); /* data follows lr_write_t */
|
|
uint64_t offset, length;
|
|
dmu_tx_t *tx;
|
|
int error;
|
|
|
|
if (byteswap)
|
|
byteswap_uint64_array(lr, sizeof (*lr));
|
|
|
|
offset = lr->lr_offset;
|
|
length = lr->lr_length;
|
|
|
|
/* If it's a dmu_sync() block, write the whole block */
|
|
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
|
|
uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
|
|
if (length < blocksize) {
|
|
offset -= offset % blocksize;
|
|
length = blocksize;
|
|
}
|
|
}
|
|
|
|
tx = dmu_tx_create(os);
|
|
dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error) {
|
|
dmu_tx_abort(tx);
|
|
} else {
|
|
dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
|
|
dmu_tx_commit(tx);
|
|
}
|
|
|
|
return (error);
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
static int
|
|
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
|
|
{
|
|
return (SET_ERROR(ENOTSUP));
|
|
}
|
|
|
|
/*
|
|
* Callback vectors for replaying records.
|
|
* Only TX_WRITE and TX_TRUNCATE are needed for zvol.
|
|
*/
|
|
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
|
|
zvol_replay_err, /* 0 no such transaction type */
|
|
zvol_replay_err, /* TX_CREATE */
|
|
zvol_replay_err, /* TX_MKDIR */
|
|
zvol_replay_err, /* TX_MKXATTR */
|
|
zvol_replay_err, /* TX_SYMLINK */
|
|
zvol_replay_err, /* TX_REMOVE */
|
|
zvol_replay_err, /* TX_RMDIR */
|
|
zvol_replay_err, /* TX_LINK */
|
|
zvol_replay_err, /* TX_RENAME */
|
|
zvol_replay_write, /* TX_WRITE */
|
|
zvol_replay_truncate, /* TX_TRUNCATE */
|
|
zvol_replay_err, /* TX_SETATTR */
|
|
zvol_replay_err, /* TX_ACL */
|
|
zvol_replay_err, /* TX_CREATE_ACL */
|
|
zvol_replay_err, /* TX_CREATE_ATTR */
|
|
zvol_replay_err, /* TX_CREATE_ACL_ATTR */
|
|
zvol_replay_err, /* TX_MKDIR_ACL */
|
|
zvol_replay_err, /* TX_MKDIR_ATTR */
|
|
zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
|
|
zvol_replay_err, /* TX_WRITE2 */
|
|
};
|
|
|
|
#ifdef illumos
/*
 * Translate a volume name into its minor number.
 *
 * Returns 0 if the volume exists (storing the number through minor when
 * minor is non-NULL) and -1 if it does not.
 */
int
zvol_name2minor(const char *name, minor_t *minor)
{
	zvol_state_t *zv;
	int rc = -1;

	mutex_enter(&zfsdev_state_lock);
	zv = zvol_minor_lookup(name);
	if (zv != NULL) {
		rc = 0;
		if (minor != NULL)
			*minor = zv->zv_minor;
	}
	mutex_exit(&zfsdev_state_lock);

	return (rc);
}
#endif	/* illumos */
|
|
|
|
/*
|
|
* Create a minor node (plus a whole lot more) for the specified volume.
|
|
*/
|
|
int
|
|
zvol_create_minor(const char *name)
|
|
{
|
|
zfs_soft_state_t *zs;
|
|
zvol_state_t *zv;
|
|
objset_t *os;
|
|
dmu_object_info_t doi;
|
|
#ifdef illumos
|
|
minor_t minor = 0;
|
|
char chrbuf[30], blkbuf[30];
|
|
#else
|
|
struct g_provider *pp;
|
|
struct g_geom *gp;
|
|
uint64_t volsize, mode;
|
|
#endif
|
|
int error;
|
|
|
|
#ifndef illumos
|
|
ZFS_LOG(1, "Creating ZVOL %s...", name);
|
|
#endif
|
|
|
|
mutex_enter(&zfsdev_state_lock);
|
|
|
|
if (zvol_minor_lookup(name) != NULL) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(EEXIST));
|
|
}
|
|
|
|
/* lie and say we're read-only */
|
|
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
|
|
|
|
if (error) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (error);
|
|
}
|
|
|
|
#ifdef illumos
|
|
if ((minor = zfsdev_minor_alloc()) == 0) {
|
|
dmu_objset_disown(os, FTAG);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(ENXIO));
|
|
}
|
|
|
|
if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
|
|
dmu_objset_disown(os, FTAG);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(EAGAIN));
|
|
}
|
|
(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
|
|
(char *)name);
|
|
|
|
(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
|
|
|
|
if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
|
|
minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
|
|
ddi_soft_state_free(zfsdev_state, minor);
|
|
dmu_objset_disown(os, FTAG);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(EAGAIN));
|
|
}
|
|
|
|
(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
|
|
|
|
if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
|
|
minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
|
|
ddi_remove_minor_node(zfs_dip, chrbuf);
|
|
ddi_soft_state_free(zfsdev_state, minor);
|
|
dmu_objset_disown(os, FTAG);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(EAGAIN));
|
|
}
|
|
|
|
zs = ddi_get_soft_state(zfsdev_state, minor);
|
|
zs->zss_type = ZSST_ZVOL;
|
|
zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
|
|
#else /* !illumos */
|
|
|
|
zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
|
|
zv->zv_state = 0;
|
|
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
|
|
if (error) {
|
|
kmem_free(zv, sizeof(*zv));
|
|
dmu_objset_disown(os, zvol_tag);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (error);
|
|
}
|
|
error = dsl_prop_get_integer(name,
|
|
zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
|
|
if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
|
|
mode = volmode;
|
|
|
|
DROP_GIANT();
|
|
zv->zv_volsize = volsize;
|
|
zv->zv_volmode = mode;
|
|
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
|
|
g_topology_lock();
|
|
gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
|
|
gp->start = zvol_geom_start;
|
|
gp->access = zvol_geom_access;
|
|
pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
|
|
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
|
|
pp->sectorsize = DEV_BSIZE;
|
|
pp->mediasize = zv->zv_volsize;
|
|
pp->private = zv;
|
|
|
|
zv->zv_provider = pp;
|
|
bioq_init(&zv->zv_queue);
|
|
mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
|
|
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
|
|
struct make_dev_args args;
|
|
|
|
make_dev_args_init(&args);
|
|
args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
|
|
args.mda_devsw = &zvol_cdevsw;
|
|
args.mda_cr = NULL;
|
|
args.mda_uid = UID_ROOT;
|
|
args.mda_gid = GID_OPERATOR;
|
|
args.mda_mode = 0640;
|
|
args.mda_si_drv2 = zv;
|
|
error = make_dev_s(&args, &zv->zv_dev,
|
|
"%s/%s", ZVOL_DRIVER, name);
|
|
if (error != 0) {
|
|
kmem_free(zv, sizeof(*zv));
|
|
dmu_objset_disown(os, FTAG);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (error);
|
|
}
|
|
zv->zv_dev->si_iosize_max = MAXPHYS;
|
|
}
|
|
LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
|
|
#endif /* illumos */
|
|
|
|
(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
|
|
zv->zv_min_bs = DEV_BSHIFT;
|
|
#ifdef illumos
|
|
zv->zv_minor = minor;
|
|
#endif
|
|
zv->zv_objset = os;
|
|
if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
|
|
zv->zv_flags |= ZVOL_RDONLY;
|
|
mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
|
|
sizeof (rl_t), offsetof(rl_t, r_node));
|
|
list_create(&zv->zv_extents, sizeof (zvol_extent_t),
|
|
offsetof(zvol_extent_t, ze_node));
|
|
/* get and cache the blocksize */
|
|
error = dmu_object_info(os, ZVOL_OBJ, &doi);
|
|
ASSERT(error == 0);
|
|
zv->zv_volblocksize = doi.doi_data_block_size;
|
|
|
|
if (spa_writeable(dmu_objset_spa(os))) {
|
|
if (zil_replay_disable)
|
|
zil_destroy(dmu_objset_zil(os), B_FALSE);
|
|
else
|
|
zil_replay(os, zv, zvol_replay_vector);
|
|
}
|
|
dmu_objset_disown(os, FTAG);
|
|
zv->zv_objset = NULL;
|
|
|
|
zvol_minors++;
|
|
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#ifndef illumos
|
|
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
|
|
zvol_geom_run(zv);
|
|
g_topology_unlock();
|
|
}
|
|
PICKUP_GIANT();
|
|
|
|
ZFS_LOG(1, "ZVOL %s created.", name);
|
|
#endif
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Remove minor node for the specified volume.
|
|
*/
|
|
static int
|
|
zvol_remove_zv(zvol_state_t *zv)
|
|
{
|
|
#ifdef illumos
|
|
char nmbuf[20];
|
|
minor_t minor = zv->zv_minor;
|
|
#endif
|
|
|
|
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
|
if (zv->zv_total_opens != 0)
|
|
return (SET_ERROR(EBUSY));
|
|
|
|
#ifdef illumos
|
|
(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
|
|
ddi_remove_minor_node(zfs_dip, nmbuf);
|
|
|
|
(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
|
|
ddi_remove_minor_node(zfs_dip, nmbuf);
|
|
#else
|
|
ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
|
|
|
|
LIST_REMOVE(zv, zv_links);
|
|
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
|
|
g_topology_lock();
|
|
zvol_geom_destroy(zv);
|
|
g_topology_unlock();
|
|
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
|
|
if (zv->zv_dev != NULL)
|
|
destroy_dev(zv->zv_dev);
|
|
}
|
|
#endif
|
|
|
|
avl_destroy(&zv->zv_znode.z_range_avl);
|
|
mutex_destroy(&zv->zv_znode.z_range_lock);
|
|
|
|
kmem_free(zv, sizeof (zvol_state_t));
|
|
#ifdef illumos
|
|
ddi_soft_state_free(zfsdev_state, minor);
|
|
#endif
|
|
zvol_minors--;
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
zvol_remove_minor(const char *name)
|
|
{
|
|
zvol_state_t *zv;
|
|
int rc;
|
|
|
|
mutex_enter(&zfsdev_state_lock);
|
|
if ((zv = zvol_minor_lookup(name)) == NULL) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(ENXIO));
|
|
}
|
|
rc = zvol_remove_zv(zv);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (rc);
|
|
}
|
|
|
|
int
|
|
zvol_first_open(zvol_state_t *zv)
|
|
{
|
|
objset_t *os;
|
|
uint64_t volsize;
|
|
int error;
|
|
uint64_t readonly;
|
|
|
|
/* lie and say we're read-only */
|
|
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
|
|
zvol_tag, &os);
|
|
if (error)
|
|
return (error);
|
|
|
|
zv->zv_objset = os;
|
|
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
|
|
if (error) {
|
|
ASSERT(error == 0);
|
|
dmu_objset_disown(os, zvol_tag);
|
|
return (error);
|
|
}
|
|
|
|
error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
|
|
if (error) {
|
|
dmu_objset_disown(os, zvol_tag);
|
|
return (error);
|
|
}
|
|
|
|
zvol_size_changed(zv, volsize);
|
|
zv->zv_zilog = zil_open(os, zvol_get_data);
|
|
|
|
VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
|
|
NULL) == 0);
|
|
if (readonly || dmu_objset_is_snapshot(os) ||
|
|
!spa_writeable(dmu_objset_spa(os)))
|
|
zv->zv_flags |= ZVOL_RDONLY;
|
|
else
|
|
zv->zv_flags &= ~ZVOL_RDONLY;
|
|
return (error);
|
|
}
|
|
|
|
void
|
|
zvol_last_close(zvol_state_t *zv)
|
|
{
|
|
zil_close(zv->zv_zilog);
|
|
zv->zv_zilog = NULL;
|
|
|
|
dmu_buf_rele(zv->zv_dbuf, zvol_tag);
|
|
zv->zv_dbuf = NULL;
|
|
|
|
/*
|
|
* Evict cached data
|
|
*/
|
|
if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
|
|
!(zv->zv_flags & ZVOL_RDONLY))
|
|
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
|
|
dmu_objset_evict_dbufs(zv->zv_objset);
|
|
|
|
dmu_objset_disown(zv->zv_objset, zvol_tag);
|
|
zv->zv_objset = NULL;
|
|
}
|
|
|
|
#ifdef illumos
/*
 * Preallocate the whole volume, at most SPA_OLD_MAXBLOCKSIZE bytes per
 * transaction.
 *
 * Returns ENOSPC when the objset reports less available space than the
 * volume size.  If a transaction cannot be assigned, everything
 * preallocated so far is freed and that error is returned.  On success the
 * function waits for the changes to sync and returns 0.
 */
int
zvol_prealloc(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	uint64_t refd, avail, usedobjs, availobjs;
	uint64_t remaining = zv->zv_volsize;
	uint64_t offset = 0;
	dmu_tx_t *tx;

	/* Make sure there is room before allocating anything. */
	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	if (avail < zv->zv_volsize)
		return (SET_ERROR(ENOSPC));

	/* Any previously mapped extents are stale now. */
	zvol_free_extents(zv);

	while (remaining != 0) {
		uint64_t chunk = MIN(remaining, SPA_OLD_MAXBLOCKSIZE);
		int rc;

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, ZVOL_OBJ, offset, chunk);
		rc = dmu_tx_assign(tx, TXG_WAIT);
		if (rc != 0) {
			dmu_tx_abort(tx);
			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, offset);
			return (rc);
		}
		dmu_prealloc(os, ZVOL_OBJ, offset, chunk, tx);
		dmu_tx_commit(tx);

		offset += chunk;
		remaining -= chunk;
	}
	txg_wait_synced(dmu_objset_pool(os), 0);

	return (0);
}
#endif	/* illumos */
|
|
|
|
static int
|
|
zvol_update_volsize(objset_t *os, uint64_t volsize)
|
|
{
|
|
dmu_tx_t *tx;
|
|
int error;
|
|
|
|
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
|
|
|
tx = dmu_tx_create(os);
|
|
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
|
|
dmu_tx_mark_netfree(tx);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error) {
|
|
dmu_tx_abort(tx);
|
|
return (error);
|
|
}
|
|
|
|
error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
|
|
&volsize, tx);
|
|
dmu_tx_commit(tx);
|
|
|
|
if (error == 0)
|
|
error = dmu_free_long_range(os,
|
|
ZVOL_OBJ, volsize, DMU_OBJECT_END);
|
|
return (error);
|
|
}
|
|
|
|
void
|
|
zvol_remove_minors(const char *name)
|
|
{
|
|
#ifdef illumos
|
|
zvol_state_t *zv;
|
|
char *namebuf;
|
|
minor_t minor;
|
|
|
|
namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
|
|
(void) strncpy(namebuf, name, strlen(name));
|
|
(void) strcat(namebuf, "/");
|
|
mutex_enter(&zfsdev_state_lock);
|
|
for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
|
|
|
|
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
|
|
if (zv == NULL)
|
|
continue;
|
|
if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
|
|
(void) zvol_remove_zv(zv);
|
|
}
|
|
kmem_free(namebuf, strlen(name) + 2);
|
|
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#else /* !illumos */
|
|
zvol_state_t *zv, *tzv;
|
|
size_t namelen;
|
|
|
|
namelen = strlen(name);
|
|
|
|
DROP_GIANT();
|
|
mutex_enter(&zfsdev_state_lock);
|
|
|
|
LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
|
|
if (strcmp(zv->zv_name, name) == 0 ||
|
|
(strncmp(zv->zv_name, name, namelen) == 0 &&
|
|
strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' ||
|
|
zv->zv_name[namelen] == '@'))) {
|
|
(void) zvol_remove_zv(zv);
|
|
}
|
|
}
|
|
|
|
mutex_exit(&zfsdev_state_lock);
|
|
PICKUP_GIANT();
|
|
#endif /* illumos */
|
|
}
|
|
|
|
static int
|
|
zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
|
|
{
|
|
uint64_t old_volsize = 0ULL;
|
|
int error = 0;
|
|
|
|
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
|
|
|
/*
|
|
* Reinitialize the dump area to the new size. If we
|
|
* failed to resize the dump area then restore it back to
|
|
* its original size. We must set the new volsize prior
|
|
* to calling dumpvp_resize() to ensure that the devices'
|
|
* size(9P) is not visible by the dump subsystem.
|
|
*/
|
|
old_volsize = zv->zv_volsize;
|
|
zvol_size_changed(zv, volsize);
|
|
|
|
#ifdef ZVOL_DUMP
|
|
if (zv->zv_flags & ZVOL_DUMPIFIED) {
|
|
if ((error = zvol_dumpify(zv)) != 0 ||
|
|
(error = dumpvp_resize()) != 0) {
|
|
int dumpify_error;
|
|
|
|
(void) zvol_update_volsize(zv->zv_objset, old_volsize);
|
|
zvol_size_changed(zv, old_volsize);
|
|
dumpify_error = zvol_dumpify(zv);
|
|
error = dumpify_error ? dumpify_error : error;
|
|
}
|
|
}
|
|
#endif /* ZVOL_DUMP */
|
|
|
|
#ifdef illumos
|
|
/*
|
|
* Generate a LUN expansion event.
|
|
*/
|
|
if (error == 0) {
|
|
sysevent_id_t eid;
|
|
nvlist_t *attr;
|
|
char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
|
|
|
|
(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
|
|
zv->zv_minor);
|
|
|
|
VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
|
|
VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
|
|
|
|
(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
|
|
ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
|
|
|
|
nvlist_free(attr);
|
|
kmem_free(physpath, MAXPATHLEN);
|
|
}
|
|
#endif /* illumos */
|
|
return (error);
|
|
}
|
|
|
|
int
|
|
zvol_set_volsize(const char *name, uint64_t volsize)
|
|
{
|
|
zvol_state_t *zv = NULL;
|
|
objset_t *os;
|
|
int error;
|
|
dmu_object_info_t doi;
|
|
uint64_t readonly;
|
|
boolean_t owned = B_FALSE;
|
|
|
|
error = dsl_prop_get_integer(name,
|
|
zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
|
|
if (error != 0)
|
|
return (error);
|
|
if (readonly)
|
|
return (SET_ERROR(EROFS));
|
|
|
|
mutex_enter(&zfsdev_state_lock);
|
|
zv = zvol_minor_lookup(name);
|
|
|
|
if (zv == NULL || zv->zv_objset == NULL) {
|
|
if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
|
|
FTAG, &os)) != 0) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (error);
|
|
}
|
|
owned = B_TRUE;
|
|
if (zv != NULL)
|
|
zv->zv_objset = os;
|
|
} else {
|
|
os = zv->zv_objset;
|
|
}
|
|
|
|
if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
|
|
(error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
|
|
goto out;
|
|
|
|
error = zvol_update_volsize(os, volsize);
|
|
|
|
if (error == 0 && zv != NULL)
|
|
error = zvol_update_live_volsize(zv, volsize);
|
|
out:
|
|
if (owned) {
|
|
dmu_objset_disown(os, FTAG);
|
|
if (zv != NULL)
|
|
zv->zv_objset = NULL;
|
|
}
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
#ifdef illumos
|
|
int
|
|
zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
|
|
#else
|
|
static int
|
|
zvol_open(struct g_provider *pp, int flag, int count)
|
|
#endif
|
|
{
|
|
zvol_state_t *zv;
|
|
int err = 0;
|
|
#ifdef illumos
|
|
|
|
mutex_enter(&zfsdev_state_lock);
|
|
|
|
zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
|
|
if (zv == NULL) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(ENXIO));
|
|
}
|
|
|
|
if (zv->zv_total_opens == 0)
|
|
err = zvol_first_open(zv);
|
|
if (err) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (err);
|
|
}
|
|
#else /* !illumos */
|
|
boolean_t locked = B_FALSE;
|
|
|
|
if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
|
|
/*
|
|
* if zfs_geom_probe_vdev_key is set, that means that zfs is
|
|
* attempting to probe geom providers while looking for a
|
|
* replacement for a missing VDEV. In this case, the
|
|
* spa_namespace_lock will not be held, but it is still illegal
|
|
* to use a zvol as a vdev. Deadlocks can result if another
|
|
* thread has spa_namespace_lock
|
|
*/
|
|
return (EOPNOTSUPP);
|
|
}
|
|
/*
|
|
* Protect against recursively entering spa_namespace_lock
|
|
* when spa_open() is used for a pool on a (local) ZVOL(s).
|
|
* This is needed since we replaced upstream zfsdev_state_lock
|
|
* with spa_namespace_lock in the ZVOL code.
|
|
* We are using the same trick as spa_open().
|
|
* Note that calls in zvol_first_open which need to resolve
|
|
* pool name to a spa object will enter spa_open()
|
|
* recursively, but that function already has all the
|
|
* necessary protection.
|
|
*/
|
|
if (!MUTEX_HELD(&zfsdev_state_lock)) {
|
|
mutex_enter(&zfsdev_state_lock);
|
|
locked = B_TRUE;
|
|
}
|
|
|
|
zv = pp->private;
|
|
if (zv == NULL) {
|
|
if (locked)
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(ENXIO));
|
|
}
|
|
|
|
if (zv->zv_total_opens == 0) {
|
|
err = zvol_first_open(zv);
|
|
if (err) {
|
|
if (locked)
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (err);
|
|
}
|
|
pp->mediasize = zv->zv_volsize;
|
|
pp->stripeoffset = 0;
|
|
pp->stripesize = zv->zv_volblocksize;
|
|
}
|
|
#endif /* illumos */
|
|
if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
|
|
err = SET_ERROR(EROFS);
|
|
goto out;
|
|
}
|
|
if (zv->zv_flags & ZVOL_EXCL) {
|
|
err = SET_ERROR(EBUSY);
|
|
goto out;
|
|
}
|
|
#ifdef FEXCL
|
|
if (flag & FEXCL) {
|
|
if (zv->zv_total_opens != 0) {
|
|
err = SET_ERROR(EBUSY);
|
|
goto out;
|
|
}
|
|
zv->zv_flags |= ZVOL_EXCL;
|
|
}
|
|
#endif
|
|
|
|
#ifdef illumos
|
|
if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
|
|
zv->zv_open_count[otyp]++;
|
|
zv->zv_total_opens++;
|
|
}
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#else
|
|
zv->zv_total_opens += count;
|
|
if (locked)
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#endif
|
|
|
|
return (err);
|
|
out:
|
|
if (zv->zv_total_opens == 0)
|
|
zvol_last_close(zv);
|
|
#ifdef illumos
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#else
|
|
if (locked)
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#endif
|
|
return (err);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
#ifdef illumos
|
|
int
|
|
zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
|
|
{
|
|
minor_t minor = getminor(dev);
|
|
zvol_state_t *zv;
|
|
int error = 0;
|
|
|
|
mutex_enter(&zfsdev_state_lock);
|
|
|
|
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
|
|
if (zv == NULL) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#else /* !illumos */
|
|
static int
|
|
zvol_close(struct g_provider *pp, int flag, int count)
|
|
{
|
|
zvol_state_t *zv;
|
|
int error = 0;
|
|
boolean_t locked = B_FALSE;
|
|
|
|
/* See comment in zvol_open(). */
|
|
if (!MUTEX_HELD(&zfsdev_state_lock)) {
|
|
mutex_enter(&zfsdev_state_lock);
|
|
locked = B_TRUE;
|
|
}
|
|
|
|
zv = pp->private;
|
|
if (zv == NULL) {
|
|
if (locked)
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#endif /* illumos */
|
|
return (SET_ERROR(ENXIO));
|
|
}
|
|
|
|
if (zv->zv_flags & ZVOL_EXCL) {
|
|
ASSERT(zv->zv_total_opens == 1);
|
|
zv->zv_flags &= ~ZVOL_EXCL;
|
|
}
|
|
|
|
/*
|
|
* If the open count is zero, this is a spurious close.
|
|
* That indicates a bug in the kernel / DDI framework.
|
|
*/
|
|
#ifdef illumos
|
|
ASSERT(zv->zv_open_count[otyp] != 0);
|
|
#endif
|
|
ASSERT(zv->zv_total_opens != 0);
|
|
|
|
/*
|
|
* You may get multiple opens, but only one close.
|
|
*/
|
|
#ifdef illumos
|
|
zv->zv_open_count[otyp]--;
|
|
zv->zv_total_opens--;
|
|
#else
|
|
zv->zv_total_opens -= count;
|
|
#endif
|
|
|
|
if (zv->zv_total_opens == 0)
|
|
zvol_last_close(zv);
|
|
|
|
#ifdef illumos
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#else
|
|
if (locked)
|
|
mutex_exit(&zfsdev_state_lock);
|
|
#endif
|
|
return (error);
|
|
}
|
|
|
|
static void
|
|
zvol_get_done(zgd_t *zgd, int error)
|
|
{
|
|
if (zgd->zgd_db)
|
|
dmu_buf_rele(zgd->zgd_db, zgd);
|
|
|
|
zfs_range_unlock(zgd->zgd_rl);
|
|
|
|
if (error == 0 && zgd->zgd_bp)
|
|
zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
|
|
|
|
kmem_free(zgd, sizeof (zgd_t));
|
|
}
|
|
|
|
/*
|
|
* Get data to generate a TX_WRITE intent log record.
|
|
*/
|
|
static int
|
|
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
|
{
|
|
zvol_state_t *zv = arg;
|
|
objset_t *os = zv->zv_objset;
|
|
uint64_t object = ZVOL_OBJ;
|
|
uint64_t offset = lr->lr_offset;
|
|
uint64_t size = lr->lr_length; /* length of user data */
|
|
blkptr_t *bp = &lr->lr_blkptr;
|
|
dmu_buf_t *db;
|
|
zgd_t *zgd;
|
|
int error;
|
|
|
|
ASSERT(zio != NULL);
|
|
ASSERT(size != 0);
|
|
|
|
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
|
zgd->zgd_zilog = zv->zv_zilog;
|
|
zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
|
|
|
|
/*
|
|
* Write records come in two flavors: immediate and indirect.
|
|
* For small writes it's cheaper to store the data with the
|
|
* log record (immediate); for large writes it's cheaper to
|
|
* sync the data and get a pointer to it (indirect) so that
|
|
* we don't have to write the data twice.
|
|
*/
|
|
if (buf != NULL) { /* immediate write */
|
|
error = dmu_read(os, object, offset, size, buf,
|
|
DMU_READ_NO_PREFETCH);
|
|
} else {
|
|
size = zv->zv_volblocksize;
|
|
offset = P2ALIGN(offset, size);
|
|
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
|
DMU_READ_NO_PREFETCH);
|
|
if (error == 0) {
|
|
blkptr_t *obp = dmu_buf_get_blkptr(db);
|
|
if (obp) {
|
|
ASSERT(BP_IS_HOLE(bp));
|
|
*bp = *obp;
|
|
}
|
|
|
|
zgd->zgd_db = db;
|
|
zgd->zgd_bp = bp;
|
|
|
|
ASSERT(db->db_offset == offset);
|
|
ASSERT(db->db_size == size);
|
|
|
|
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
|
zvol_get_done, zgd);
|
|
|
|
if (error == 0)
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
zvol_get_done(zgd, error);
|
|
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
|
|
*
|
|
* We store data in the log buffers if it's small enough.
|
|
* Otherwise we will later flush the data out via dmu_sync().
|
|
*/
|
|
ssize_t zvol_immediate_write_sz = 32768;
|
|
#ifdef _KERNEL
|
|
SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
|
|
&zvol_immediate_write_sz, 0, "Minimal size for indirect log write");
|
|
#endif
|
|
|
|
static void
|
|
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
|
|
boolean_t sync)
|
|
{
|
|
uint32_t blocksize = zv->zv_volblocksize;
|
|
zilog_t *zilog = zv->zv_zilog;
|
|
itx_wr_state_t write_state;
|
|
|
|
if (zil_replaying(zilog, tx))
|
|
return;
|
|
|
|
if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
|
|
write_state = WR_INDIRECT;
|
|
else if (!spa_has_slogs(zilog->zl_spa) &&
|
|
resid >= blocksize && blocksize > zvol_immediate_write_sz)
|
|
write_state = WR_INDIRECT;
|
|
else if (sync)
|
|
write_state = WR_COPIED;
|
|
else
|
|
write_state = WR_NEED_COPY;
|
|
|
|
while (resid) {
|
|
itx_t *itx;
|
|
lr_write_t *lr;
|
|
itx_wr_state_t wr_state = write_state;
|
|
ssize_t len = resid;
|
|
|
|
if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
|
|
wr_state = WR_NEED_COPY;
|
|
else if (wr_state == WR_INDIRECT)
|
|
len = MIN(blocksize - P2PHASE(off, blocksize), resid);
|
|
|
|
itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
|
|
(wr_state == WR_COPIED ? len : 0));
|
|
lr = (lr_write_t *)&itx->itx_lr;
|
|
if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
|
|
ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
|
|
zil_itx_destroy(itx);
|
|
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
|
|
lr = (lr_write_t *)&itx->itx_lr;
|
|
wr_state = WR_NEED_COPY;
|
|
}
|
|
|
|
itx->itx_wr_state = wr_state;
|
|
lr->lr_foid = ZVOL_OBJ;
|
|
lr->lr_offset = off;
|
|
lr->lr_length = len;
|
|
lr->lr_blkoff = 0;
|
|
BP_ZERO(&lr->lr_blkptr);
|
|
|
|
itx->itx_private = zv;
|
|
|
|
if (!sync && (zv->zv_sync_cnt == 0))
|
|
itx->itx_sync = B_FALSE;
|
|
|
|
zil_itx_assign(zilog, itx, tx);
|
|
|
|
off += len;
|
|
resid -= len;
|
|
}
|
|
}
|
|
|
|
#ifdef illumos
|
|
static int
|
|
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
|
|
uint64_t size, boolean_t doread, boolean_t isdump)
|
|
{
|
|
vdev_disk_t *dvd;
|
|
int c;
|
|
int numerrors = 0;
|
|
|
|
if (vd->vdev_ops == &vdev_mirror_ops ||
|
|
vd->vdev_ops == &vdev_replacing_ops ||
|
|
vd->vdev_ops == &vdev_spare_ops) {
|
|
for (c = 0; c < vd->vdev_children; c++) {
|
|
int err = zvol_dumpio_vdev(vd->vdev_child[c],
|
|
addr, offset, origoffset, size, doread, isdump);
|
|
if (err != 0) {
|
|
numerrors++;
|
|
} else if (doread) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
|
|
return (numerrors < vd->vdev_children ? 0 : EIO);
|
|
|
|
if (doread && !vdev_readable(vd))
|
|
return (SET_ERROR(EIO));
|
|
else if (!doread && !vdev_writeable(vd))
|
|
return (SET_ERROR(EIO));
|
|
|
|
if (vd->vdev_ops == &vdev_raidz_ops) {
|
|
return (vdev_raidz_physio(vd,
|
|
addr, size, offset, origoffset, doread, isdump));
|
|
}
|
|
|
|
offset += VDEV_LABEL_START_SIZE;
|
|
|
|
if (ddi_in_panic() || isdump) {
|
|
ASSERT(!doread);
|
|
if (doread)
|
|
return (SET_ERROR(EIO));
|
|
dvd = vd->vdev_tsd;
|
|
ASSERT3P(dvd, !=, NULL);
|
|
return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
|
|
lbtodb(size)));
|
|
} else {
|
|
dvd = vd->vdev_tsd;
|
|
ASSERT3P(dvd, !=, NULL);
|
|
return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
|
|
offset, doread ? B_READ : B_WRITE));
|
|
}
|
|
}
|
|
|
|
static int
|
|
zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
|
|
boolean_t doread, boolean_t isdump)
|
|
{
|
|
vdev_t *vd;
|
|
int error;
|
|
zvol_extent_t *ze;
|
|
spa_t *spa = dmu_objset_spa(zv->zv_objset);
|
|
|
|
/* Must be sector aligned, and not stradle a block boundary. */
|
|
if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
|
|
P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
|
|
return (SET_ERROR(EINVAL));
|
|
}
|
|
ASSERT(size <= zv->zv_volblocksize);
|
|
|
|
/* Locate the extent this belongs to */
|
|
ze = list_head(&zv->zv_extents);
|
|
while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
|
|
offset -= ze->ze_nblks * zv->zv_volblocksize;
|
|
ze = list_next(&zv->zv_extents, ze);
|
|
}
|
|
|
|
if (ze == NULL)
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
if (!ddi_in_panic())
|
|
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
|
|
|
|
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
|
|
offset += DVA_GET_OFFSET(&ze->ze_dva);
|
|
error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
|
|
size, doread, isdump);
|
|
|
|
if (!ddi_in_panic())
|
|
spa_config_exit(spa, SCL_STATE, FTAG);
|
|
|
|
return (error);
|
|
}
|
|
|
|
int
|
|
zvol_strategy(buf_t *bp)
|
|
{
|
|
zfs_soft_state_t *zs = NULL;
|
|
#else /* !illumos */
|
|
void
|
|
zvol_strategy(struct bio *bp)
|
|
{
|
|
#endif /* illumos */
|
|
zvol_state_t *zv;
|
|
uint64_t off, volsize;
|
|
size_t resid;
|
|
char *addr;
|
|
objset_t *os;
|
|
rl_t *rl;
|
|
int error = 0;
|
|
#ifdef illumos
|
|
boolean_t doread = bp->b_flags & B_READ;
|
|
#else
|
|
boolean_t doread = 0;
|
|
#endif
|
|
boolean_t is_dumpified;
|
|
boolean_t sync;
|
|
|
|
#ifdef illumos
|
|
if (getminor(bp->b_edev) == 0) {
|
|
error = SET_ERROR(EINVAL);
|
|
} else {
|
|
zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
|
|
if (zs == NULL)
|
|
error = SET_ERROR(ENXIO);
|
|
else if (zs->zss_type != ZSST_ZVOL)
|
|
error = SET_ERROR(EINVAL);
|
|
}
|
|
|
|
if (error) {
|
|
bioerror(bp, error);
|
|
biodone(bp);
|
|
return (0);
|
|
}
|
|
|
|
zv = zs->zss_data;
|
|
|
|
if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
|
|
bioerror(bp, EROFS);
|
|
biodone(bp);
|
|
return (0);
|
|
}
|
|
|
|
off = ldbtob(bp->b_blkno);
|
|
#else /* !illumos */
|
|
if (bp->bio_to)
|
|
zv = bp->bio_to->private;
|
|
else
|
|
zv = bp->bio_dev->si_drv2;
|
|
|
|
if (zv == NULL) {
|
|
error = SET_ERROR(ENXIO);
|
|
goto out;
|
|
}
|
|
|
|
if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
|
|
error = SET_ERROR(EROFS);
|
|
goto out;
|
|
}
|
|
|
|
switch (bp->bio_cmd) {
|
|
case BIO_FLUSH:
|
|
goto sync;
|
|
case BIO_READ:
|
|
doread = 1;
|
|
case BIO_WRITE:
|
|
case BIO_DELETE:
|
|
break;
|
|
default:
|
|
error = EOPNOTSUPP;
|
|
goto out;
|
|
}
|
|
|
|
off = bp->bio_offset;
|
|
#endif /* illumos */
|
|
volsize = zv->zv_volsize;
|
|
|
|
os = zv->zv_objset;
|
|
ASSERT(os != NULL);
|
|
|
|
#ifdef illumos
|
|
bp_mapin(bp);
|
|
addr = bp->b_un.b_addr;
|
|
resid = bp->b_bcount;
|
|
|
|
if (resid > 0 && (off < 0 || off >= volsize)) {
|
|
bioerror(bp, EIO);
|
|
biodone(bp);
|
|
return (0);
|
|
}
|
|
|
|
is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
|
|
sync = ((!(bp->b_flags & B_ASYNC) &&
|
|
!(zv->zv_flags & ZVOL_WCE)) ||
|
|
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
|
|
!doread && !is_dumpified;
|
|
#else /* !illumos */
|
|
addr = bp->bio_data;
|
|
resid = bp->bio_length;
|
|
|
|
if (resid > 0 && (off < 0 || off >= volsize)) {
|
|
error = SET_ERROR(EIO);
|
|
goto out;
|
|
}
|
|
|
|
is_dumpified = B_FALSE;
|
|
sync = !doread && !is_dumpified &&
|
|
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
|
|
#endif /* illumos */
|
|
|
|
/*
|
|
* There must be no buffer changes when doing a dmu_sync() because
|
|
* we can't change the data whilst calculating the checksum.
|
|
*/
|
|
rl = zfs_range_lock(&zv->zv_znode, off, resid,
|
|
doread ? RL_READER : RL_WRITER);
|
|
|
|
#ifndef illumos
|
|
if (bp->bio_cmd == BIO_DELETE) {
|
|
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error != 0) {
|
|
dmu_tx_abort(tx);
|
|
} else {
|
|
zvol_log_truncate(zv, tx, off, resid, sync);
|
|
dmu_tx_commit(tx);
|
|
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
|
|
off, resid);
|
|
resid = 0;
|
|
}
|
|
goto unlock;
|
|
}
|
|
#endif
|
|
while (resid != 0 && off < volsize) {
|
|
size_t size = MIN(resid, zvol_maxphys);
|
|
#ifdef illumos
|
|
if (is_dumpified) {
|
|
size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
|
|
error = zvol_dumpio(zv, addr, off, size,
|
|
doread, B_FALSE);
|
|
} else if (doread) {
|
|
#else
|
|
if (doread) {
|
|
#endif
|
|
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
|
|
DMU_READ_PREFETCH);
|
|
} else {
|
|
dmu_tx_t *tx = dmu_tx_create(os);
|
|
dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error) {
|
|
dmu_tx_abort(tx);
|
|
} else {
|
|
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
|
|
zvol_log_write(zv, tx, off, size, sync);
|
|
dmu_tx_commit(tx);
|
|
}
|
|
}
|
|
if (error) {
|
|
/* convert checksum errors into IO errors */
|
|
if (error == ECKSUM)
|
|
error = SET_ERROR(EIO);
|
|
break;
|
|
}
|
|
off += size;
|
|
addr += size;
|
|
resid -= size;
|
|
}
|
|
#ifndef illumos
|
|
unlock:
|
|
#endif
|
|
zfs_range_unlock(rl);
|
|
|
|
#ifdef illumos
|
|
if ((bp->b_resid = resid) == bp->b_bcount)
|
|
bioerror(bp, off > volsize ? EINVAL : error);
|
|
|
|
if (sync)
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
biodone(bp);
|
|
|
|
return (0);
|
|
#else /* !illumos */
|
|
bp->bio_completed = bp->bio_length - resid;
|
|
if (bp->bio_completed < bp->bio_length && off > volsize)
|
|
error = EINVAL;
|
|
|
|
if (sync) {
|
|
sync:
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
}
|
|
out:
|
|
if (bp->bio_to)
|
|
g_io_deliver(bp, error);
|
|
else
|
|
biofinish(bp, NULL, error);
|
|
#endif /* illumos */
|
|
}
|
|
|
|
#ifdef illumos
|
|
/*
|
|
* Set the buffer count to the zvol maximum transfer.
|
|
* Using our own routine instead of the default minphys()
|
|
* means that for larger writes we write bigger buffers on X86
|
|
* (128K instead of 56K) and flush the disk write cache less often
|
|
* (every zvol_maxphys - currently 1MB) instead of minphys (currently
|
|
* 56K on X86 and 128K on sparc).
|
|
*/
|
|
void
|
|
zvol_minphys(struct buf *bp)
|
|
{
|
|
if (bp->b_bcount > zvol_maxphys)
|
|
bp->b_bcount = zvol_maxphys;
|
|
}
|
|
|
|
int
|
|
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
|
|
{
|
|
minor_t minor = getminor(dev);
|
|
zvol_state_t *zv;
|
|
int error = 0;
|
|
uint64_t size;
|
|
uint64_t boff;
|
|
uint64_t resid;
|
|
|
|
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
|
|
if (zv == NULL)
|
|
return (SET_ERROR(ENXIO));
|
|
|
|
if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
boff = ldbtob(blkno);
|
|
resid = ldbtob(nblocks);
|
|
|
|
VERIFY3U(boff + resid, <=, zv->zv_volsize);
|
|
|
|
while (resid) {
|
|
size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
|
|
error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
|
|
if (error)
|
|
break;
|
|
boff += size;
|
|
addr += size;
|
|
resid -= size;
|
|
}
|
|
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
int
|
|
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
|
|
{
|
|
minor_t minor = getminor(dev);
|
|
#else /* !illumos */
|
|
int
|
|
zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
|
|
{
|
|
#endif /* illumos */
|
|
zvol_state_t *zv;
|
|
uint64_t volsize;
|
|
rl_t *rl;
|
|
int error = 0;
|
|
|
|
#ifdef illumos
|
|
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
|
|
if (zv == NULL)
|
|
return (SET_ERROR(ENXIO));
|
|
#else
|
|
zv = dev->si_drv2;
|
|
#endif
|
|
|
|
volsize = zv->zv_volsize;
|
|
/* uio_loffset == volsize isn't an error as its required for EOF processing. */
|
|
if (uio->uio_resid > 0 &&
|
|
(uio->uio_loffset < 0 || uio->uio_loffset > volsize))
|
|
return (SET_ERROR(EIO));
|
|
|
|
#ifdef illumos
|
|
if (zv->zv_flags & ZVOL_DUMPIFIED) {
|
|
error = physio(zvol_strategy, NULL, dev, B_READ,
|
|
zvol_minphys, uio);
|
|
return (error);
|
|
}
|
|
#endif
|
|
|
|
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
|
|
RL_READER);
|
|
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
|
|
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
|
|
|
|
/* don't read past the end */
|
|
if (bytes > volsize - uio->uio_loffset)
|
|
bytes = volsize - uio->uio_loffset;
|
|
|
|
error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
|
|
if (error) {
|
|
/* convert checksum errors into IO errors */
|
|
if (error == ECKSUM)
|
|
error = SET_ERROR(EIO);
|
|
break;
|
|
}
|
|
}
|
|
zfs_range_unlock(rl);
|
|
return (error);
|
|
}
|
|
|
|
#ifdef illumos
|
|
/*ARGSUSED*/
|
|
int
|
|
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
|
|
{
|
|
minor_t minor = getminor(dev);
|
|
#else /* !illumos */
|
|
int
|
|
zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
|
|
{
|
|
#endif /* illumos */
|
|
zvol_state_t *zv;
|
|
uint64_t volsize;
|
|
rl_t *rl;
|
|
int error = 0;
|
|
boolean_t sync;
|
|
|
|
#ifdef illumos
|
|
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
|
|
if (zv == NULL)
|
|
return (SET_ERROR(ENXIO));
|
|
#else
|
|
zv = dev->si_drv2;
|
|
#endif
|
|
|
|
volsize = zv->zv_volsize;
|
|
/* uio_loffset == volsize isn't an error as its required for EOF processing. */
|
|
if (uio->uio_resid > 0 &&
|
|
(uio->uio_loffset < 0 || uio->uio_loffset > volsize))
|
|
return (SET_ERROR(EIO));
|
|
|
|
#ifdef illumos
|
|
if (zv->zv_flags & ZVOL_DUMPIFIED) {
|
|
error = physio(zvol_strategy, NULL, dev, B_WRITE,
|
|
zvol_minphys, uio);
|
|
return (error);
|
|
}
|
|
|
|
sync = !(zv->zv_flags & ZVOL_WCE) ||
|
|
#else
|
|
sync = (ioflag & IO_SYNC) ||
|
|
#endif
|
|
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
|
|
|
|
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
|
|
RL_WRITER);
|
|
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
|
|
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
|
|
uint64_t off = uio->uio_loffset;
|
|
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
|
|
|
|
if (bytes > volsize - off) /* don't write past the end */
|
|
bytes = volsize - off;
|
|
|
|
dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error) {
|
|
dmu_tx_abort(tx);
|
|
break;
|
|
}
|
|
error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
|
|
if (error == 0)
|
|
zvol_log_write(zv, tx, off, bytes, sync);
|
|
dmu_tx_commit(tx);
|
|
|
|
if (error)
|
|
break;
|
|
}
|
|
zfs_range_unlock(rl);
|
|
if (sync)
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
return (error);
|
|
}
|
|
|
|
#ifdef illumos
|
|
int
|
|
zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
|
|
{
|
|
struct uuid uuid = EFI_RESERVED;
|
|
efi_gpe_t gpe = { 0 };
|
|
uint32_t crc;
|
|
dk_efi_t efi;
|
|
int length;
|
|
char *ptr;
|
|
|
|
if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
|
|
return (SET_ERROR(EFAULT));
|
|
ptr = (char *)(uintptr_t)efi.dki_data_64;
|
|
length = efi.dki_length;
|
|
/*
|
|
* Some clients may attempt to request a PMBR for the
|
|
* zvol. Currently this interface will return EINVAL to
|
|
* such requests. These requests could be supported by
|
|
* adding a check for lba == 0 and consing up an appropriate
|
|
* PMBR.
|
|
*/
|
|
if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
gpe.efi_gpe_StartingLBA = LE_64(34ULL);
|
|
gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
|
|
UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
|
|
|
|
if (efi.dki_lba == 1) {
|
|
efi_gpt_t gpt = { 0 };
|
|
|
|
gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
|
|
gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
|
|
gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
|
|
gpt.efi_gpt_MyLBA = LE_64(1ULL);
|
|
gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
|
|
gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
|
|
gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
|
|
gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
|
|
gpt.efi_gpt_SizeOfPartitionEntry =
|
|
LE_32(sizeof (efi_gpe_t));
|
|
CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
|
|
gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
|
|
CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
|
|
gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
|
|
if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
|
|
flag))
|
|
return (SET_ERROR(EFAULT));
|
|
ptr += sizeof (gpt);
|
|
length -= sizeof (gpt);
|
|
}
|
|
if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
|
|
length), flag))
|
|
return (SET_ERROR(EFAULT));
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* BEGIN entry points to allow external callers access to the volume.
|
|
*/
|
|
/*
|
|
* Return the volume parameters needed for access from an external caller.
|
|
* These values are invariant as long as the volume is held open.
|
|
*/
|
|
int
|
|
zvol_get_volume_params(minor_t minor, uint64_t *blksize,
|
|
uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
|
|
void **rl_hdl, void **bonus_hdl)
|
|
{
|
|
zvol_state_t *zv;
|
|
|
|
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
|
|
if (zv == NULL)
|
|
return (SET_ERROR(ENXIO));
|
|
if (zv->zv_flags & ZVOL_DUMPIFIED)
|
|
return (SET_ERROR(ENXIO));
|
|
|
|
ASSERT(blksize && max_xfer_len && minor_hdl &&
|
|
objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
|
|
|
|
*blksize = zv->zv_volblocksize;
|
|
*max_xfer_len = (uint64_t)zvol_maxphys;
|
|
*minor_hdl = zv;
|
|
*objset_hdl = zv->zv_objset;
|
|
*zil_hdl = zv->zv_zilog;
|
|
*rl_hdl = &zv->zv_znode;
|
|
*bonus_hdl = zv->zv_dbuf;
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Return the current volume size to an external caller.
|
|
* The size can change while the volume is open.
|
|
*/
|
|
uint64_t
|
|
zvol_get_volume_size(void *minor_hdl)
|
|
{
|
|
zvol_state_t *zv = minor_hdl;
|
|
|
|
return (zv->zv_volsize);
|
|
}
|
|
|
|
/*
|
|
* Return the current WCE setting to an external caller.
|
|
* The WCE setting can change while the volume is open.
|
|
*/
|
|
int
|
|
zvol_get_volume_wce(void *minor_hdl)
|
|
{
|
|
zvol_state_t *zv = minor_hdl;
|
|
|
|
return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
|
|
}
|
|
|
|
/*
|
|
* Entry point for external callers to zvol_log_write
|
|
*/
|
|
void
|
|
zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
|
|
boolean_t sync)
|
|
{
|
|
zvol_state_t *zv = minor_hdl;
|
|
|
|
zvol_log_write(zv, tx, off, resid, sync);
|
|
}
|
|
/*
|
|
* END entry points to allow external callers access to the volume.
|
|
*/
|
|
#endif /* illumos */
|
|
|
|
/*
|
|
* Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
|
|
*/
|
|
static void
|
|
zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
|
|
boolean_t sync)
|
|
{
|
|
itx_t *itx;
|
|
lr_truncate_t *lr;
|
|
zilog_t *zilog = zv->zv_zilog;
|
|
|
|
if (zil_replaying(zilog, tx))
|
|
return;
|
|
|
|
itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
|
|
lr = (lr_truncate_t *)&itx->itx_lr;
|
|
lr->lr_foid = ZVOL_OBJ;
|
|
lr->lr_offset = off;
|
|
lr->lr_length = len;
|
|
|
|
itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
|
|
zil_itx_assign(zilog, itx, tx);
|
|
}
|
|
|
|
#ifdef illumos
|
|
/*
|
|
* Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
|
|
* Also a dirtbag dkio ioctl for unmap/free-block functionality.
|
|
*/
|
|
/*ARGSUSED*/
|
|
int
|
|
zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
|
|
{
|
|
zvol_state_t *zv;
|
|
struct dk_callback *dkc;
|
|
int error = 0;
|
|
rl_t *rl;
|
|
|
|
mutex_enter(&zfsdev_state_lock);
|
|
|
|
zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
|
|
|
|
if (zv == NULL) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (SET_ERROR(ENXIO));
|
|
}
|
|
ASSERT(zv->zv_total_opens > 0);
|
|
|
|
switch (cmd) {
|
|
|
|
case DKIOCINFO:
|
|
{
|
|
struct dk_cinfo dki;
|
|
|
|
bzero(&dki, sizeof (dki));
|
|
(void) strcpy(dki.dki_cname, "zvol");
|
|
(void) strcpy(dki.dki_dname, "zvol");
|
|
dki.dki_ctype = DKC_UNKNOWN;
|
|
dki.dki_unit = getminor(dev);
|
|
dki.dki_maxtransfer =
|
|
1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
|
|
error = SET_ERROR(EFAULT);
|
|
return (error);
|
|
}
|
|
|
|
case DKIOCGMEDIAINFO:
|
|
{
|
|
struct dk_minfo dkm;
|
|
|
|
bzero(&dkm, sizeof (dkm));
|
|
dkm.dki_lbsize = 1U << zv->zv_min_bs;
|
|
dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
|
|
dkm.dki_media_type = DK_UNKNOWN;
|
|
mutex_exit(&zfsdev_state_lock);
|
|
if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
|
|
error = SET_ERROR(EFAULT);
|
|
return (error);
|
|
}
|
|
|
|
case DKIOCGMEDIAINFOEXT:
|
|
{
|
|
struct dk_minfo_ext dkmext;
|
|
|
|
bzero(&dkmext, sizeof (dkmext));
|
|
dkmext.dki_lbsize = 1U << zv->zv_min_bs;
|
|
dkmext.dki_pbsize = zv->zv_volblocksize;
|
|
dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
|
|
dkmext.dki_media_type = DK_UNKNOWN;
|
|
mutex_exit(&zfsdev_state_lock);
|
|
if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
|
|
error = SET_ERROR(EFAULT);
|
|
return (error);
|
|
}
|
|
|
|
case DKIOCGETEFI:
|
|
{
|
|
uint64_t vs = zv->zv_volsize;
|
|
uint8_t bs = zv->zv_min_bs;
|
|
|
|
mutex_exit(&zfsdev_state_lock);
|
|
error = zvol_getefi((void *)arg, flag, vs, bs);
|
|
return (error);
|
|
}
|
|
|
|
case DKIOCFLUSHWRITECACHE:
|
|
dkc = (struct dk_callback *)arg;
|
|
mutex_exit(&zfsdev_state_lock);
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
|
|
(*dkc->dkc_callback)(dkc->dkc_cookie, error);
|
|
error = 0;
|
|
}
|
|
return (error);
|
|
|
|
case DKIOCGETWCE:
|
|
{
|
|
int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
|
|
if (ddi_copyout(&wce, (void *)arg, sizeof (int),
|
|
flag))
|
|
error = SET_ERROR(EFAULT);
|
|
break;
|
|
}
|
|
case DKIOCSETWCE:
|
|
{
|
|
int wce;
|
|
if (ddi_copyin((void *)arg, &wce, sizeof (int),
|
|
flag)) {
|
|
error = SET_ERROR(EFAULT);
|
|
break;
|
|
}
|
|
if (wce) {
|
|
zv->zv_flags |= ZVOL_WCE;
|
|
mutex_exit(&zfsdev_state_lock);
|
|
} else {
|
|
zv->zv_flags &= ~ZVOL_WCE;
|
|
mutex_exit(&zfsdev_state_lock);
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
case DKIOCGGEOM:
|
|
case DKIOCGVTOC:
|
|
/*
|
|
* commands using these (like prtvtoc) expect ENOTSUP
|
|
* since we're emulating an EFI label
|
|
*/
|
|
error = SET_ERROR(ENOTSUP);
|
|
break;
|
|
|
|
case DKIOCDUMPINIT:
|
|
rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
|
|
RL_WRITER);
|
|
error = zvol_dumpify(zv);
|
|
zfs_range_unlock(rl);
|
|
break;
|
|
|
|
case DKIOCDUMPFINI:
|
|
if (!(zv->zv_flags & ZVOL_DUMPIFIED))
|
|
break;
|
|
rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
|
|
RL_WRITER);
|
|
error = zvol_dump_fini(zv);
|
|
zfs_range_unlock(rl);
|
|
break;
|
|
|
|
case DKIOCFREE:
|
|
{
|
|
dkioc_free_t df;
|
|
dmu_tx_t *tx;
|
|
|
|
if (!zvol_unmap_enabled)
|
|
break;
|
|
|
|
if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
|
|
error = SET_ERROR(EFAULT);
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Apply Postel's Law to length-checking. If they overshoot,
|
|
* just blank out until the end, if there's a need to blank
|
|
* out anything.
|
|
*/
|
|
if (df.df_start >= zv->zv_volsize)
|
|
break; /* No need to do anything... */
|
|
|
|
mutex_exit(&zfsdev_state_lock);
|
|
|
|
rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
|
|
RL_WRITER);
|
|
tx = dmu_tx_create(zv->zv_objset);
|
|
dmu_tx_mark_netfree(tx);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error != 0) {
|
|
dmu_tx_abort(tx);
|
|
} else {
|
|
zvol_log_truncate(zv, tx, df.df_start,
|
|
df.df_length, B_TRUE);
|
|
dmu_tx_commit(tx);
|
|
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
|
|
df.df_start, df.df_length);
|
|
}
|
|
|
|
zfs_range_unlock(rl);
|
|
|
|
if (error == 0) {
|
|
/*
|
|
* If the write-cache is disabled or 'sync' property
|
|
* is set to 'always' then treat this as a synchronous
|
|
* operation (i.e. commit to zil).
|
|
*/
|
|
if (!(zv->zv_flags & ZVOL_WCE) ||
|
|
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
|
|
/*
|
|
* If the caller really wants synchronous writes, and
|
|
* can't wait for them, don't return until the write
|
|
* is done.
|
|
*/
|
|
if (df.df_flags & DF_WAIT_SYNC) {
|
|
txg_wait_synced(
|
|
dmu_objset_pool(zv->zv_objset), 0);
|
|
}
|
|
}
|
|
return (error);
|
|
}
|
|
|
|
default:
|
|
error = SET_ERROR(ENOTTY);
|
|
break;
|
|
|
|
}
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (error);
|
|
}
|
|
#endif /* illumos */
|
|
|
|
int
|
|
zvol_busy(void)
|
|
{
|
|
return (zvol_minors != 0);
|
|
}
|
|
|
|
void
|
|
zvol_init(void)
|
|
{
|
|
VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
|
|
1) == 0);
|
|
#ifdef illumos
|
|
mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
#else
|
|
ZFS_LOG(1, "ZVOL Initialized.");
|
|
#endif
|
|
}
|
|
|
|
void
|
|
zvol_fini(void)
|
|
{
|
|
#ifdef illumos
|
|
mutex_destroy(&zfsdev_state_lock);
|
|
#endif
|
|
ddi_soft_state_fini(&zfsdev_state);
|
|
ZFS_LOG(1, "ZVOL Deinitialized.");
|
|
}
|
|
|
|
#ifdef illumos
|
|
/*ARGSUSED*/
|
|
static int
|
|
zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
|
|
{
|
|
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
|
|
|
|
if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
|
|
return (1);
|
|
return (0);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static void
|
|
zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
|
|
{
|
|
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
|
|
|
|
spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
|
|
}
|
|
|
|
static int
|
|
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
|
|
{
|
|
dmu_tx_t *tx;
|
|
int error;
|
|
objset_t *os = zv->zv_objset;
|
|
spa_t *spa = dmu_objset_spa(os);
|
|
vdev_t *vd = spa->spa_root_vdev;
|
|
nvlist_t *nv = NULL;
|
|
uint64_t version = spa_version(spa);
|
|
uint64_t checksum, compress, refresrv, vbs, dedup;
|
|
|
|
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
|
ASSERT(vd->vdev_ops == &vdev_root_ops);
|
|
|
|
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
|
|
DMU_OBJECT_END);
|
|
if (error != 0)
|
|
return (error);
|
|
/* wait for dmu_free_long_range to actually free the blocks */
|
|
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
|
|
|
|
/*
|
|
* If the pool on which the dump device is being initialized has more
|
|
* than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
|
|
* enabled. If so, bump that feature's counter to indicate that the
|
|
* feature is active. We also check the vdev type to handle the
|
|
* following case:
|
|
* # zpool create test raidz disk1 disk2 disk3
|
|
* Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
|
|
* the raidz vdev itself has 3 children.
|
|
*/
|
|
if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
|
|
if (!spa_feature_is_enabled(spa,
|
|
SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
|
|
return (SET_ERROR(ENOTSUP));
|
|
(void) dsl_sync_task(spa_name(spa),
|
|
zfs_mvdev_dump_feature_check,
|
|
zfs_mvdev_dump_activate_feature_sync, NULL,
|
|
2, ZFS_SPACE_CHECK_RESERVED);
|
|
}
|
|
|
|
if (!resize) {
|
|
error = dsl_prop_get_integer(zv->zv_name,
|
|
zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
|
|
if (error == 0) {
|
|
error = dsl_prop_get_integer(zv->zv_name,
|
|
zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
|
|
NULL);
|
|
}
|
|
if (error == 0) {
|
|
error = dsl_prop_get_integer(zv->zv_name,
|
|
zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
|
|
&refresrv, NULL);
|
|
}
|
|
if (error == 0) {
|
|
error = dsl_prop_get_integer(zv->zv_name,
|
|
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
|
|
NULL);
|
|
}
|
|
if (version >= SPA_VERSION_DEDUP && error == 0) {
|
|
error = dsl_prop_get_integer(zv->zv_name,
|
|
zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
|
|
}
|
|
}
|
|
if (error != 0)
|
|
return (error);
|
|
|
|
tx = dmu_tx_create(os);
|
|
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
|
|
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error != 0) {
|
|
dmu_tx_abort(tx);
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* If we are resizing the dump device then we only need to
|
|
* update the refreservation to match the newly updated
|
|
* zvolsize. Otherwise, we save off the original state of the
|
|
* zvol so that we can restore them if the zvol is ever undumpified.
|
|
*/
|
|
if (resize) {
|
|
error = zap_update(os, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
|
|
&zv->zv_volsize, tx);
|
|
} else {
|
|
error = zap_update(os, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
|
|
&compress, tx);
|
|
if (error == 0) {
|
|
error = zap_update(os, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
|
|
&checksum, tx);
|
|
}
|
|
if (error == 0) {
|
|
error = zap_update(os, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
|
|
&refresrv, tx);
|
|
}
|
|
if (error == 0) {
|
|
error = zap_update(os, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
|
|
&vbs, tx);
|
|
}
|
|
if (error == 0) {
|
|
error = dmu_object_set_blocksize(
|
|
os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
|
|
}
|
|
if (version >= SPA_VERSION_DEDUP && error == 0) {
|
|
error = zap_update(os, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
|
|
&dedup, tx);
|
|
}
|
|
if (error == 0)
|
|
zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
|
|
}
|
|
dmu_tx_commit(tx);
|
|
|
|
/*
|
|
* We only need update the zvol's property if we are initializing
|
|
* the dump area for the first time.
|
|
*/
|
|
if (error == 0 && !resize) {
|
|
/*
|
|
* If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
|
|
* function. Otherwise, use the old default -- OFF.
|
|
*/
|
|
checksum = spa_feature_is_active(spa,
|
|
SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
|
|
ZIO_CHECKSUM_OFF;
|
|
|
|
VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
|
|
VERIFY(nvlist_add_uint64(nv,
|
|
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
|
|
VERIFY(nvlist_add_uint64(nv,
|
|
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
|
|
ZIO_COMPRESS_OFF) == 0);
|
|
VERIFY(nvlist_add_uint64(nv,
|
|
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
|
|
checksum) == 0);
|
|
if (version >= SPA_VERSION_DEDUP) {
|
|
VERIFY(nvlist_add_uint64(nv,
|
|
zfs_prop_to_name(ZFS_PROP_DEDUP),
|
|
ZIO_CHECKSUM_OFF) == 0);
|
|
}
|
|
|
|
error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
|
|
nv, NULL);
|
|
nvlist_free(nv);
|
|
}
|
|
|
|
/* Allocate the space for the dump */
|
|
if (error == 0)
|
|
error = zvol_prealloc(zv);
|
|
return (error);
|
|
}
|
|
|
|
static int
|
|
zvol_dumpify(zvol_state_t *zv)
|
|
{
|
|
int error = 0;
|
|
uint64_t dumpsize = 0;
|
|
dmu_tx_t *tx;
|
|
objset_t *os = zv->zv_objset;
|
|
|
|
if (zv->zv_flags & ZVOL_RDONLY)
|
|
return (SET_ERROR(EROFS));
|
|
|
|
if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
|
|
8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
|
|
boolean_t resize = (dumpsize > 0);
|
|
|
|
if ((error = zvol_dump_init(zv, resize)) != 0) {
|
|
(void) zvol_dump_fini(zv);
|
|
return (error);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Build up our lba mapping.
|
|
*/
|
|
error = zvol_get_lbas(zv);
|
|
if (error) {
|
|
(void) zvol_dump_fini(zv);
|
|
return (error);
|
|
}
|
|
|
|
tx = dmu_tx_create(os);
|
|
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error) {
|
|
dmu_tx_abort(tx);
|
|
(void) zvol_dump_fini(zv);
|
|
return (error);
|
|
}
|
|
|
|
zv->zv_flags |= ZVOL_DUMPIFIED;
|
|
error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
|
|
&zv->zv_volsize, tx);
|
|
dmu_tx_commit(tx);
|
|
|
|
if (error) {
|
|
(void) zvol_dump_fini(zv);
|
|
return (error);
|
|
}
|
|
|
|
txg_wait_synced(dmu_objset_pool(os), 0);
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
zvol_dump_fini(zvol_state_t *zv)
|
|
{
|
|
dmu_tx_t *tx;
|
|
objset_t *os = zv->zv_objset;
|
|
nvlist_t *nv;
|
|
int error = 0;
|
|
uint64_t checksum, compress, refresrv, vbs, dedup;
|
|
uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
|
|
|
|
/*
|
|
* Attempt to restore the zvol back to its pre-dumpified state.
|
|
* This is a best-effort attempt as it's possible that not all
|
|
* of these properties were initialized during the dumpify process
|
|
* (i.e. error during zvol_dump_init).
|
|
*/
|
|
|
|
tx = dmu_tx_create(os);
|
|
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error) {
|
|
dmu_tx_abort(tx);
|
|
return (error);
|
|
}
|
|
(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
|
|
dmu_tx_commit(tx);
|
|
|
|
(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
|
|
(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
|
|
(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
|
|
(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
|
|
|
|
VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
|
|
(void) nvlist_add_uint64(nv,
|
|
zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
|
|
(void) nvlist_add_uint64(nv,
|
|
zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
|
|
(void) nvlist_add_uint64(nv,
|
|
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
|
|
if (version >= SPA_VERSION_DEDUP &&
|
|
zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
|
|
zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
|
|
(void) nvlist_add_uint64(nv,
|
|
zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
|
|
}
|
|
(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
|
|
nv, NULL);
|
|
nvlist_free(nv);
|
|
|
|
zvol_free_extents(zv);
|
|
zv->zv_flags &= ~ZVOL_DUMPIFIED;
|
|
(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
|
|
/* wait for dmu_free_long_range to actually free the blocks */
|
|
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
|
|
tx = dmu_tx_create(os);
|
|
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error) {
|
|
dmu_tx_abort(tx);
|
|
return (error);
|
|
}
|
|
if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
|
|
zv->zv_volblocksize = vbs;
|
|
dmu_tx_commit(tx);
|
|
|
|
return (0);
|
|
}
|
|
#else /* !illumos */
|
|
|
|
static void
|
|
zvol_geom_run(zvol_state_t *zv)
|
|
{
|
|
struct g_provider *pp;
|
|
|
|
pp = zv->zv_provider;
|
|
g_error_provider(pp, 0);
|
|
|
|
kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
|
|
"zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
|
|
}
|
|
|
|
static void
|
|
zvol_geom_destroy(zvol_state_t *zv)
|
|
{
|
|
struct g_provider *pp;
|
|
|
|
g_topology_assert();
|
|
|
|
mtx_lock(&zv->zv_queue_mtx);
|
|
zv->zv_state = 1;
|
|
wakeup_one(&zv->zv_queue);
|
|
while (zv->zv_state != 2)
|
|
msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
|
|
mtx_destroy(&zv->zv_queue_mtx);
|
|
|
|
pp = zv->zv_provider;
|
|
zv->zv_provider = NULL;
|
|
pp->private = NULL;
|
|
g_wither_geom(pp->geom, ENXIO);
|
|
}
|
|
|
|
static int
|
|
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
|
|
{
|
|
int count, error, flags;
|
|
|
|
g_topology_assert();
|
|
|
|
/*
|
|
* To make it easier we expect either open or close, but not both
|
|
* at the same time.
|
|
*/
|
|
KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
|
|
(acr <= 0 && acw <= 0 && ace <= 0),
|
|
("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
|
|
pp->name, acr, acw, ace));
|
|
|
|
if (pp->private == NULL) {
|
|
if (acr <= 0 && acw <= 0 && ace <= 0)
|
|
return (0);
|
|
return (pp->error);
|
|
}
|
|
|
|
/*
|
|
* We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
|
|
* because GEOM already handles that and handles it a bit differently.
|
|
* GEOM allows for multiple read/exclusive consumers and ZFS allows
|
|
* only one exclusive consumer, no matter if it is reader or writer.
|
|
* I like better the way GEOM works so I'll leave it for GEOM to
|
|
* decide what to do.
|
|
*/
|
|
|
|
count = acr + acw + ace;
|
|
if (count == 0)
|
|
return (0);
|
|
|
|
flags = 0;
|
|
if (acr != 0 || ace != 0)
|
|
flags |= FREAD;
|
|
if (acw != 0)
|
|
flags |= FWRITE;
|
|
|
|
g_topology_unlock();
|
|
if (count > 0)
|
|
error = zvol_open(pp, flags, count);
|
|
else
|
|
error = zvol_close(pp, flags, -count);
|
|
g_topology_lock();
|
|
return (error);
|
|
}
|
|
|
|
static void
|
|
zvol_geom_start(struct bio *bp)
|
|
{
|
|
zvol_state_t *zv;
|
|
boolean_t first;
|
|
|
|
zv = bp->bio_to->private;
|
|
ASSERT(zv != NULL);
|
|
switch (bp->bio_cmd) {
|
|
case BIO_FLUSH:
|
|
if (!THREAD_CAN_SLEEP())
|
|
goto enqueue;
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
g_io_deliver(bp, 0);
|
|
break;
|
|
case BIO_READ:
|
|
case BIO_WRITE:
|
|
case BIO_DELETE:
|
|
if (!THREAD_CAN_SLEEP())
|
|
goto enqueue;
|
|
zvol_strategy(bp);
|
|
break;
|
|
case BIO_GETATTR: {
|
|
spa_t *spa = dmu_objset_spa(zv->zv_objset);
|
|
uint64_t refd, avail, usedobjs, availobjs, val;
|
|
|
|
if (g_handleattr_int(bp, "GEOM::candelete", 1))
|
|
return;
|
|
if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
|
|
dmu_objset_space(zv->zv_objset, &refd, &avail,
|
|
&usedobjs, &availobjs);
|
|
if (g_handleattr_off_t(bp, "blocksavail",
|
|
avail / DEV_BSIZE))
|
|
return;
|
|
} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
|
|
dmu_objset_space(zv->zv_objset, &refd, &avail,
|
|
&usedobjs, &availobjs);
|
|
if (g_handleattr_off_t(bp, "blocksused",
|
|
refd / DEV_BSIZE))
|
|
return;
|
|
} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
|
|
avail = metaslab_class_get_space(spa_normal_class(spa));
|
|
avail -= metaslab_class_get_alloc(spa_normal_class(spa));
|
|
if (g_handleattr_off_t(bp, "poolblocksavail",
|
|
avail / DEV_BSIZE))
|
|
return;
|
|
} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
|
|
refd = metaslab_class_get_alloc(spa_normal_class(spa));
|
|
if (g_handleattr_off_t(bp, "poolblocksused",
|
|
refd / DEV_BSIZE))
|
|
return;
|
|
}
|
|
/* FALLTHROUGH */
|
|
}
|
|
default:
|
|
g_io_deliver(bp, EOPNOTSUPP);
|
|
break;
|
|
}
|
|
return;
|
|
|
|
enqueue:
|
|
mtx_lock(&zv->zv_queue_mtx);
|
|
first = (bioq_first(&zv->zv_queue) == NULL);
|
|
bioq_insert_tail(&zv->zv_queue, bp);
|
|
mtx_unlock(&zv->zv_queue_mtx);
|
|
if (first)
|
|
wakeup_one(&zv->zv_queue);
|
|
}
|
|
|
|
static void
|
|
zvol_geom_worker(void *arg)
|
|
{
|
|
zvol_state_t *zv;
|
|
struct bio *bp;
|
|
|
|
thread_lock(curthread);
|
|
sched_prio(curthread, PRIBIO);
|
|
thread_unlock(curthread);
|
|
|
|
zv = arg;
|
|
for (;;) {
|
|
mtx_lock(&zv->zv_queue_mtx);
|
|
bp = bioq_takefirst(&zv->zv_queue);
|
|
if (bp == NULL) {
|
|
if (zv->zv_state == 1) {
|
|
zv->zv_state = 2;
|
|
wakeup(&zv->zv_state);
|
|
mtx_unlock(&zv->zv_queue_mtx);
|
|
kthread_exit();
|
|
}
|
|
msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
|
|
"zvol:io", 0);
|
|
continue;
|
|
}
|
|
mtx_unlock(&zv->zv_queue_mtx);
|
|
switch (bp->bio_cmd) {
|
|
case BIO_FLUSH:
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
g_io_deliver(bp, 0);
|
|
break;
|
|
case BIO_READ:
|
|
case BIO_WRITE:
|
|
case BIO_DELETE:
|
|
zvol_strategy(bp);
|
|
break;
|
|
default:
|
|
g_io_deliver(bp, EOPNOTSUPP);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
extern boolean_t dataset_name_hidden(const char *name);
|
|
|
|
static int
|
|
zvol_create_snapshots(objset_t *os, const char *name)
|
|
{
|
|
uint64_t cookie, obj;
|
|
char *sname;
|
|
int error, len;
|
|
|
|
cookie = obj = 0;
|
|
sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
|
|
|
|
#if 0
|
|
(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
|
|
DS_FIND_SNAPSHOTS);
|
|
#endif
|
|
|
|
for (;;) {
|
|
len = snprintf(sname, MAXPATHLEN, "%s@", name);
|
|
if (len >= MAXPATHLEN) {
|
|
dmu_objset_rele(os, FTAG);
|
|
error = ENAMETOOLONG;
|
|
break;
|
|
}
|
|
|
|
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
|
|
error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
|
|
sname + len, &obj, &cookie, NULL);
|
|
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
|
|
if (error != 0) {
|
|
if (error == ENOENT)
|
|
error = 0;
|
|
break;
|
|
}
|
|
|
|
error = zvol_create_minor(sname);
|
|
if (error != 0 && error != EEXIST) {
|
|
printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
|
|
sname, error);
|
|
break;
|
|
}
|
|
}
|
|
|
|
kmem_free(sname, MAXPATHLEN);
|
|
return (error);
|
|
}
|
|
|
|
int
|
|
zvol_create_minors(const char *name)
|
|
{
|
|
uint64_t cookie;
|
|
objset_t *os;
|
|
char *osname, *p;
|
|
int error, len;
|
|
|
|
if (dataset_name_hidden(name))
|
|
return (0);
|
|
|
|
if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
|
|
printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
|
|
name, error);
|
|
return (error);
|
|
}
|
|
if (dmu_objset_type(os) == DMU_OST_ZVOL) {
|
|
dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
|
|
dsl_pool_rele(dmu_objset_pool(os), FTAG);
|
|
error = zvol_create_minor(name);
|
|
if (error == 0 || error == EEXIST) {
|
|
error = zvol_create_snapshots(os, name);
|
|
} else {
|
|
printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
|
|
name, error);
|
|
}
|
|
dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
|
|
dsl_dataset_rele(os->os_dsl_dataset, FTAG);
|
|
return (error);
|
|
}
|
|
if (dmu_objset_type(os) != DMU_OST_ZFS) {
|
|
dmu_objset_rele(os, FTAG);
|
|
return (0);
|
|
}
|
|
|
|
osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
|
|
if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
|
|
dmu_objset_rele(os, FTAG);
|
|
kmem_free(osname, MAXPATHLEN);
|
|
return (ENOENT);
|
|
}
|
|
p = osname + strlen(osname);
|
|
len = MAXPATHLEN - (p - osname);
|
|
|
|
#if 0
|
|
/* Prefetch the datasets. */
|
|
cookie = 0;
|
|
while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
|
|
if (!dataset_name_hidden(osname))
|
|
(void) dmu_objset_prefetch(osname, NULL);
|
|
}
|
|
#endif
|
|
|
|
cookie = 0;
|
|
while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
|
|
&cookie) == 0) {
|
|
dmu_objset_rele(os, FTAG);
|
|
(void)zvol_create_minors(osname);
|
|
if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
|
|
printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
|
|
name, error);
|
|
return (error);
|
|
}
|
|
}
|
|
|
|
dmu_objset_rele(os, FTAG);
|
|
kmem_free(osname, MAXPATHLEN);
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
zvol_rename_minor(zvol_state_t *zv, const char *newname)
|
|
{
|
|
struct g_geom *gp;
|
|
struct g_provider *pp;
|
|
struct cdev *dev;
|
|
|
|
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
|
|
|
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
|
|
g_topology_lock();
|
|
pp = zv->zv_provider;
|
|
ASSERT(pp != NULL);
|
|
gp = pp->geom;
|
|
ASSERT(gp != NULL);
|
|
|
|
zv->zv_provider = NULL;
|
|
g_wither_provider(pp, ENXIO);
|
|
|
|
pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
|
|
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
|
|
pp->sectorsize = DEV_BSIZE;
|
|
pp->mediasize = zv->zv_volsize;
|
|
pp->private = zv;
|
|
zv->zv_provider = pp;
|
|
g_error_provider(pp, 0);
|
|
g_topology_unlock();
|
|
} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
|
|
struct make_dev_args args;
|
|
|
|
if ((dev = zv->zv_dev) != NULL) {
|
|
zv->zv_dev = NULL;
|
|
destroy_dev(dev);
|
|
if (zv->zv_total_opens > 0) {
|
|
zv->zv_flags &= ~ZVOL_EXCL;
|
|
zv->zv_total_opens = 0;
|
|
zvol_last_close(zv);
|
|
}
|
|
}
|
|
|
|
make_dev_args_init(&args);
|
|
args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
|
|
args.mda_devsw = &zvol_cdevsw;
|
|
args.mda_cr = NULL;
|
|
args.mda_uid = UID_ROOT;
|
|
args.mda_gid = GID_OPERATOR;
|
|
args.mda_mode = 0640;
|
|
args.mda_si_drv2 = zv;
|
|
if (make_dev_s(&args, &zv->zv_dev,
|
|
"%s/%s", ZVOL_DRIVER, newname) == 0)
|
|
zv->zv_dev->si_iosize_max = MAXPHYS;
|
|
}
|
|
strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
|
|
}
|
|
|
|
void
|
|
zvol_rename_minors(const char *oldname, const char *newname)
|
|
{
|
|
char name[MAXPATHLEN];
|
|
struct g_provider *pp;
|
|
struct g_geom *gp;
|
|
size_t oldnamelen, newnamelen;
|
|
zvol_state_t *zv;
|
|
char *namebuf;
|
|
boolean_t locked = B_FALSE;
|
|
|
|
oldnamelen = strlen(oldname);
|
|
newnamelen = strlen(newname);
|
|
|
|
DROP_GIANT();
|
|
/* See comment in zvol_open(). */
|
|
if (!MUTEX_HELD(&zfsdev_state_lock)) {
|
|
mutex_enter(&zfsdev_state_lock);
|
|
locked = B_TRUE;
|
|
}
|
|
|
|
LIST_FOREACH(zv, &all_zvols, zv_links) {
|
|
if (strcmp(zv->zv_name, oldname) == 0) {
|
|
zvol_rename_minor(zv, newname);
|
|
} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
|
|
(zv->zv_name[oldnamelen] == '/' ||
|
|
zv->zv_name[oldnamelen] == '@')) {
|
|
snprintf(name, sizeof(name), "%s%c%s", newname,
|
|
zv->zv_name[oldnamelen],
|
|
zv->zv_name + oldnamelen + 1);
|
|
zvol_rename_minor(zv, name);
|
|
}
|
|
}
|
|
|
|
if (locked)
|
|
mutex_exit(&zfsdev_state_lock);
|
|
PICKUP_GIANT();
|
|
}
|
|
|
|
static int
|
|
zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
|
|
{
|
|
zvol_state_t *zv = dev->si_drv2;
|
|
int err = 0;
|
|
|
|
mutex_enter(&zfsdev_state_lock);
|
|
if (zv->zv_total_opens == 0)
|
|
err = zvol_first_open(zv);
|
|
if (err) {
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (err);
|
|
}
|
|
if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
|
|
err = SET_ERROR(EROFS);
|
|
goto out;
|
|
}
|
|
if (zv->zv_flags & ZVOL_EXCL) {
|
|
err = SET_ERROR(EBUSY);
|
|
goto out;
|
|
}
|
|
#ifdef FEXCL
|
|
if (flags & FEXCL) {
|
|
if (zv->zv_total_opens != 0) {
|
|
err = SET_ERROR(EBUSY);
|
|
goto out;
|
|
}
|
|
zv->zv_flags |= ZVOL_EXCL;
|
|
}
|
|
#endif
|
|
|
|
zv->zv_total_opens++;
|
|
if (flags & (FSYNC | FDSYNC)) {
|
|
zv->zv_sync_cnt++;
|
|
if (zv->zv_sync_cnt == 1)
|
|
zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
|
|
}
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (err);
|
|
out:
|
|
if (zv->zv_total_opens == 0)
|
|
zvol_last_close(zv);
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (err);
|
|
}
|
|
|
|
static int
|
|
zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
|
|
{
|
|
zvol_state_t *zv = dev->si_drv2;
|
|
|
|
mutex_enter(&zfsdev_state_lock);
|
|
if (zv->zv_flags & ZVOL_EXCL) {
|
|
ASSERT(zv->zv_total_opens == 1);
|
|
zv->zv_flags &= ~ZVOL_EXCL;
|
|
}
|
|
|
|
/*
|
|
* If the open count is zero, this is a spurious close.
|
|
* That indicates a bug in the kernel / DDI framework.
|
|
*/
|
|
ASSERT(zv->zv_total_opens != 0);
|
|
|
|
/*
|
|
* You may get multiple opens, but only one close.
|
|
*/
|
|
zv->zv_total_opens--;
|
|
if (flags & (FSYNC | FDSYNC))
|
|
zv->zv_sync_cnt--;
|
|
|
|
if (zv->zv_total_opens == 0)
|
|
zvol_last_close(zv);
|
|
|
|
mutex_exit(&zfsdev_state_lock);
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
|
|
{
|
|
zvol_state_t *zv;
|
|
rl_t *rl;
|
|
off_t offset, length;
|
|
int i, error;
|
|
boolean_t sync;
|
|
|
|
zv = dev->si_drv2;
|
|
|
|
error = 0;
|
|
KASSERT(zv->zv_total_opens > 0,
|
|
("Device with zero access count in zvol_d_ioctl"));
|
|
|
|
i = IOCPARM_LEN(cmd);
|
|
switch (cmd) {
|
|
case DIOCGSECTORSIZE:
|
|
*(u_int *)data = DEV_BSIZE;
|
|
break;
|
|
case DIOCGMEDIASIZE:
|
|
*(off_t *)data = zv->zv_volsize;
|
|
break;
|
|
case DIOCGFLUSH:
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
break;
|
|
case DIOCGDELETE:
|
|
if (!zvol_unmap_enabled)
|
|
break;
|
|
|
|
offset = ((off_t *)data)[0];
|
|
length = ((off_t *)data)[1];
|
|
if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
|
|
offset < 0 || offset >= zv->zv_volsize ||
|
|
length <= 0) {
|
|
printf("%s: offset=%jd length=%jd\n", __func__, offset,
|
|
length);
|
|
error = EINVAL;
|
|
break;
|
|
}
|
|
|
|
rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER);
|
|
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
if (error != 0) {
|
|
sync = FALSE;
|
|
dmu_tx_abort(tx);
|
|
} else {
|
|
sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
|
|
zvol_log_truncate(zv, tx, offset, length, sync);
|
|
dmu_tx_commit(tx);
|
|
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
|
|
offset, length);
|
|
}
|
|
zfs_range_unlock(rl);
|
|
if (sync)
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
break;
|
|
case DIOCGSTRIPESIZE:
|
|
*(off_t *)data = zv->zv_volblocksize;
|
|
break;
|
|
case DIOCGSTRIPEOFFSET:
|
|
*(off_t *)data = 0;
|
|
break;
|
|
case DIOCGATTR: {
|
|
spa_t *spa = dmu_objset_spa(zv->zv_objset);
|
|
struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
|
|
uint64_t refd, avail, usedobjs, availobjs;
|
|
|
|
if (strcmp(arg->name, "GEOM::candelete") == 0)
|
|
arg->value.i = 1;
|
|
else if (strcmp(arg->name, "blocksavail") == 0) {
|
|
dmu_objset_space(zv->zv_objset, &refd, &avail,
|
|
&usedobjs, &availobjs);
|
|
arg->value.off = avail / DEV_BSIZE;
|
|
} else if (strcmp(arg->name, "blocksused") == 0) {
|
|
dmu_objset_space(zv->zv_objset, &refd, &avail,
|
|
&usedobjs, &availobjs);
|
|
arg->value.off = refd / DEV_BSIZE;
|
|
} else if (strcmp(arg->name, "poolblocksavail") == 0) {
|
|
avail = metaslab_class_get_space(spa_normal_class(spa));
|
|
avail -= metaslab_class_get_alloc(spa_normal_class(spa));
|
|
arg->value.off = avail / DEV_BSIZE;
|
|
} else if (strcmp(arg->name, "poolblocksused") == 0) {
|
|
refd = metaslab_class_get_alloc(spa_normal_class(spa));
|
|
arg->value.off = refd / DEV_BSIZE;
|
|
} else
|
|
error = ENOIOCTL;
|
|
break;
|
|
}
|
|
case FIOSEEKHOLE:
|
|
case FIOSEEKDATA: {
|
|
off_t *off = (off_t *)data;
|
|
uint64_t noff;
|
|
boolean_t hole;
|
|
|
|
hole = (cmd == FIOSEEKHOLE);
|
|
noff = *off;
|
|
error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
|
|
*off = noff;
|
|
break;
|
|
}
|
|
default:
|
|
error = ENOIOCTL;
|
|
}
|
|
|
|
return (error);
|
|
}
|
|
#endif /* illumos */
|