freebsd-nq/sys/dev/xen/blkback/blkback.c
Attilio Rao 444b91868b Make the newbus subsystem Giant free by adding the new newbus sxlock.
The newbus lock is responsible for protecting newbus internal structures,
device states and devclass flags. It is necessary to hold it when all
such data are accessed. For the other operations, softc locking should
ensure enough protection to avoid races.

The newbus lock is automatically held when virtual operations on the
device and bus are invoked while loading the driver or when suspend/resume
take place. For other 'spurious' operations trying to access or modify
the newbus topology, the newbus lock needs to be acquired and dropped
manually.

For the moment Giant is also acquired in some key points (the module
subsystem) in order to avoid problems before the 8.0 release, as module
handlers could make assumptions about it. This Giant locking should be
removed right after the release happens.

Please keep in mind that the public interface can be expanded in order
to provide more support if real needs arise at some point, and also that
some bugs could turn up, as the patch still needs a bit of further
testing.

Bump __FreeBSD_version in order to reflect the newbus lock introduction.

Reviewed by:    ed, hps, jhb, imp, mav, scottl
No answer by:   ariff, thompsa, yongari
Tested by:      pho,
                G. Trematerra <giovanni dot trematerra at gmail dot com>,
                Brandon Gooch <jamesbrandongooch at gmail dot com>
Sponsored by:   Yahoo! Incorporated
Approved by:	re (ksmith)
2009-08-02 14:28:40 +00:00

/*
* Copyright (c) 2006, Cisco Systems, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/disk.h>
#include <sys/bio.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/sysctl.h>
#include <geom/geom.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <machine/xen-os.h>
#include <machine/hypervisor.h>
#include <machine/hypervisor-ifs.h>
#include <machine/xen_intr.h>
#include <machine/evtchn.h>
#include <machine/xenbus.h>
#include <machine/gnttab.h>
#include <machine/xen-public/memory.h>
#include <dev/xen/xenbus/xenbus_comms.h>
#if XEN_BLKBACK_DEBUG
#define DPRINTF(fmt, args...) \
printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) ((void)0)
#endif
#define WPRINTF(fmt, args...) \
printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#define BLKBACK_INVALID_HANDLE (~0)
struct ring_ref {
vm_offset_t va;
grant_handle_t handle;
uint64_t bus_addr;
};
typedef struct blkback_info {
/* Schedule lists */
STAILQ_ENTRY(blkback_info) next_req;
int on_req_sched_list;
struct xenbus_device *xdev;
XenbusState frontend_state;
domid_t domid;
int state;
int ring_connected;
struct ring_ref rr;
blkif_back_ring_t ring;
evtchn_port_t evtchn;
int irq;
void *irq_cookie;
int ref_cnt;
int handle;
char *mode;
char *type;
char *dev_name;
struct vnode *vn;
struct cdev *cdev;
struct cdevsw *csw;
u_int sector_size;
int sector_size_shift;
off_t media_size;
u_int media_num_sectors;
int major;
int minor;
int read_only;
struct mtx blk_ring_lock;
device_t ndev;
/* Stats */
int st_rd_req;
int st_wr_req;
int st_oo_req;
int st_err_req;
} blkif_t;
/*
* These are rather arbitrary. They are fairly large because adjacent requests
* pulled from a communication ring are quite likely to end up being part of
* the same scatter/gather request at the disc.
*
* ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
*
* This will increase the chances of being able to write whole tracks.
* 64 should be enough to keep us competitive with Linux.
*/
static int blkif_reqs = 64;
TUNABLE_INT("xen.vbd.blkif_reqs", &blkif_reqs);
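/*
 * The tunable can be set at boot time from /boot/loader.conf; for
 * example (the value 128 here is only illustrative):
 *
 *     xen.vbd.blkif_reqs="128"
 */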
static int mmap_pages;
/*
* Each outstanding request that we've passed to the lower device layers has a
* 'pending_req' allocated to it. Each buffer_head that completes decrements
* the pendcnt towards zero. When it hits zero, the specified domain has a
* response queued for it, with the saved 'id' passed back.
*/
typedef struct pending_req {
blkif_t *blkif;
uint64_t id;
int nr_pages;
int pendcnt;
unsigned short operation;
int status;
STAILQ_ENTRY(pending_req) free_list;
} pending_req_t;
static pending_req_t *pending_reqs;
static STAILQ_HEAD(pending_reqs_list, pending_req) pending_free =
STAILQ_HEAD_INITIALIZER(pending_free);
static struct mtx pending_free_lock;
static STAILQ_HEAD(blkback_req_sched_list, blkback_info) req_sched_list =
STAILQ_HEAD_INITIALIZER(req_sched_list);
static struct mtx req_sched_list_lock;
static unsigned long mmap_vstart;
static unsigned long *pending_vaddrs;
static grant_handle_t *pending_grant_handles;
static struct task blk_req_task;
/* Protos */
static void disconnect_ring(blkif_t *blkif);
static int vbd_add_dev(struct xenbus_device *xdev);
static inline int vaddr_pagenr(pending_req_t *req, int seg)
{
return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}
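/*
 * The per-segment pages and grant handles live in flat arrays: request r,
 * segment s maps to slot r * BLKIF_MAX_SEGMENTS_PER_REQUEST + s, so each
 * pending_req owns a contiguous run of BLKIF_MAX_SEGMENTS_PER_REQUEST
 * entries in pending_vaddrs[] and pending_grant_handles[].
 */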
static inline unsigned long vaddr(pending_req_t *req, int seg)
{
return pending_vaddrs[vaddr_pagenr(req, seg)];
}
#define pending_handle(_req, _seg) \
(pending_grant_handles[vaddr_pagenr(_req, _seg)])
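/*
 * Allocate a virtual address range and hand the machine pages backing it
 * back to the hypervisor, leaving unbacked VA space that grant mappings
 * can later be installed into. The update_va_mapping operations are
 * batched sixteen at a time with one XENMEM_decrease_reservation into a
 * single multicall (hence mcl[17]).
 */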
static unsigned long
alloc_empty_page_range(unsigned long nr_pages)
{
void *pages;
int i = 0, j = 0;
multicall_entry_t mcl[17];
unsigned long mfn_list[16];
struct xen_memory_reservation reservation = {
.extent_start = mfn_list,
.nr_extents = 0,
.address_bits = 0,
.extent_order = 0,
.domid = DOMID_SELF
};
pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT);
if (pages == NULL)
return 0;
memset(mcl, 0, sizeof(mcl));
while (i < nr_pages) {
unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE);
mcl[j].op = __HYPERVISOR_update_va_mapping;
mcl[j].args[0] = va;
mfn_list[j++] = vtomach(va) >> PAGE_SHIFT;
xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY;
if (j == 16 || i == nr_pages) {
mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL;
reservation.nr_extents = j;
mcl[j].op = __HYPERVISOR_memory_op;
mcl[j].args[0] = XENMEM_decrease_reservation;
mcl[j].args[1] = (unsigned long)&reservation;
(void)HYPERVISOR_multicall(mcl, j+1);
mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0;
j = 0;
}
}
return (unsigned long)pages;
}
static pending_req_t *
alloc_req(void)
{
pending_req_t *req;
mtx_lock(&pending_free_lock);
if ((req = STAILQ_FIRST(&pending_free))) {
STAILQ_REMOVE(&pending_free, req, pending_req, free_list);
STAILQ_NEXT(req, free_list) = NULL;
}
mtx_unlock(&pending_free_lock);
return req;
}
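/*
 * Return a request to the free list. If the list was empty, the task may
 * be parked in its out-of-pending-reqs path, so kick it to rescan the
 * schedule list.
 */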
static void
free_req(pending_req_t *req)
{
int was_empty;
mtx_lock(&pending_free_lock);
was_empty = STAILQ_EMPTY(&pending_free);
STAILQ_INSERT_TAIL(&pending_free, req, free_list);
mtx_unlock(&pending_free_lock);
if (was_empty)
taskqueue_enqueue(taskqueue_swi, &blk_req_task);
}
static void
fast_flush_area(pending_req_t *req)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int i, invcount = 0;
grant_handle_t handle;
int ret;
for (i = 0; i < req->nr_pages; i++) {
handle = pending_handle(req, i);
if (handle == BLKBACK_INVALID_HANDLE)
continue;
unmap[invcount].host_addr = vaddr(req, i);
unmap[invcount].dev_bus_addr = 0;
unmap[invcount].handle = handle;
pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
invcount++;
}
ret = HYPERVISOR_grant_table_op(
GNTTABOP_unmap_grant_ref, unmap, invcount);
PANIC_IF(ret);
}
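/*
 * Reference counting: the blkif is torn down (ring disconnected, strings
 * and structure freed) when the last reference is dropped in blkif_put().
 */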
static void
blkif_get(blkif_t *blkif)
{
atomic_add_int(&blkif->ref_cnt, 1);
}
static void
blkif_put(blkif_t *blkif)
{
if (atomic_fetchadd_int(&blkif->ref_cnt, -1) == 1) {
DPRINTF("Removing %x\n", (unsigned int)blkif);
disconnect_ring(blkif);
if (blkif->mode)
free(blkif->mode, M_DEVBUF);
if (blkif->type)
free(blkif->type, M_DEVBUF);
if (blkif->dev_name)
free(blkif->dev_name, M_DEVBUF);
free(blkif, M_DEVBUF);
}
}
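/*
 * Allocate and initialize a blkif. On success the blkif takes ownership
 * of the mode/type/params strings (allocated by xenbus_read() in
 * blkback_probe()); they are released in blkif_put().
 */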
static int
blkif_create(struct xenbus_device *xdev, long handle, char *mode, char *type, char *params)
{
blkif_t *blkif;
blkif = (blkif_t *)malloc(sizeof(*blkif), M_DEVBUF, M_NOWAIT | M_ZERO);
if (!blkif)
return ENOMEM;
DPRINTF("Created %x\n", (unsigned int)blkif);
blkif->ref_cnt = 1;
blkif->domid = xdev->otherend_id;
blkif->handle = handle;
blkif->mode = mode;
blkif->type = type;
blkif->dev_name = params;
blkif->xdev = xdev;
xdev->data = blkif;
mtx_init(&blkif->blk_ring_lock, "blk_ring_lock", "blkback ring lock", MTX_DEF);
if (strcmp(mode, "w"))
blkif->read_only = 1;
return 0;
}
static void
add_to_req_schedule_list_tail(blkif_t *blkif)
{
if (!blkif->on_req_sched_list) {
mtx_lock(&req_sched_list_lock);
if (!blkif->on_req_sched_list && (blkif->state == XenbusStateConnected)) {
blkif_get(blkif);
STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req);
blkif->on_req_sched_list = 1;
taskqueue_enqueue(taskqueue_swi, &blk_req_task);
}
mtx_unlock(&req_sched_list_lock);
}
}
/*
 * This routine does not call blkif_get(), does not schedule the
 * blk_req_task to run, and assumes that the state is connected.
 */
static void
add_to_req_schedule_list_tail2(blkif_t *blkif)
{
mtx_lock(&req_sched_list_lock);
if (!blkif->on_req_sched_list) {
STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req);
blkif->on_req_sched_list = 1;
}
mtx_unlock(&req_sched_list_lock);
}
/* Removes blkif from front of list and does not call blkif_put() (caller must) */
static blkif_t *
remove_from_req_schedule_list(void)
{
blkif_t *blkif;
mtx_lock(&req_sched_list_lock);
if ((blkif = STAILQ_FIRST(&req_sched_list))) {
STAILQ_REMOVE(&req_sched_list, blkif, blkback_info, next_req);
STAILQ_NEXT(blkif, next_req) = NULL;
blkif->on_req_sched_list = 0;
}
mtx_unlock(&req_sched_list_lock);
return blkif;
}
static void
make_response(blkif_t *blkif, uint64_t id,
unsigned short op, int st)
{
blkif_response_t *resp;
blkif_back_ring_t *blk_ring = &blkif->ring;
int more_to_do = 0;
int notify;
mtx_lock(&blkif->blk_ring_lock);
/* Place on the response ring for the relevant domain. */
resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
resp->id = id;
resp->operation = op;
resp->status = st;
blk_ring->rsp_prod_pvt++;
RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
/*
* Tail check for pending requests. Allows frontend to avoid
* notifications if requests are already in flight (lower
* overheads and promotes batching).
*/
RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
} else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring))
more_to_do = 1;
mtx_unlock(&blkif->blk_ring_lock);
if (more_to_do)
add_to_req_schedule_list_tail(blkif);
if (notify)
notify_remote_via_irq(blkif->irq);
}
static void
end_block_io_op(struct bio *bio)
{
pending_req_t *pending_req = bio->bio_caller2;
if (bio->bio_error) {
DPRINTF("BIO returned error %d for operation on device %s\n",
bio->bio_error, pending_req->blkif->dev_name);
pending_req->status = BLKIF_RSP_ERROR;
pending_req->blkif->st_err_req++;
}
#if 0
printf("done: bio=%x error=%x completed=%llu resid=%lu flags=%x\n",
(unsigned int)bio, bio->bio_error, bio->bio_completed, bio->bio_resid, bio->bio_flags);
#endif
if (atomic_fetchadd_int(&pending_req->pendcnt, -1) == 1) {
fast_flush_area(pending_req);
make_response(pending_req->blkif, pending_req->id,
pending_req->operation, pending_req->status);
blkif_put(pending_req->blkif);
free_req(pending_req);
}
g_destroy_bio(bio);
}
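/*
 * Dispatch a single read/write request: validate its segments, grant-map
 * the frontend's pages into this pending_req's reserved VA slots, build
 * one bio per segment and hand them to the device's d_strategy routine.
 * Completion is counted down in end_block_io_op() via pendcnt.
 */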
static void
dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req)
{
struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct {
unsigned long buf; unsigned int nsec;
} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int nseg = req->nr_segments, nr_sects = 0;
struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
int operation, ret, i, nbio = 0;
/* Check that number of segments is sane. */
if (unlikely(nseg == 0) ||
unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
DPRINTF("Bad number of segments in request (%d)\n", nseg);
goto fail_response;
}
if (req->operation == BLKIF_OP_WRITE) {
if (blkif->read_only) {
DPRINTF("Attempt to write to read only device %s\n", blkif->dev_name);
goto fail_response;
}
operation = BIO_WRITE;
} else
operation = BIO_READ;
pending_req->blkif = blkif;
pending_req->id = req->id;
pending_req->operation = req->operation;
pending_req->status = BLKIF_RSP_OKAY;
pending_req->nr_pages = nseg;
for (i = 0; i < nseg; i++) {
seg[i].nsec = req->seg[i].last_sect -
req->seg[i].first_sect + 1;
if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
(seg[i].nsec <= 0))
goto fail_response;
nr_sects += seg[i].nsec;
map[i].host_addr = vaddr(pending_req, i);
map[i].dom = blkif->domid;
map[i].ref = req->seg[i].gref;
map[i].flags = GNTMAP_host_map;
if (operation == BIO_WRITE)
map[i].flags |= GNTMAP_readonly;
}
/* Convert to the disk's sector size */
nr_sects = (nr_sects << 9) >> blkif->sector_size_shift;
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
PANIC_IF(ret);
for (i = 0; i < nseg; i++) {
if (unlikely(map[i].status != 0)) {
DPRINTF("invalid buffer -- could not remap it\n");
goto fail_flush;
}
pending_handle(pending_req, i) = map[i].handle;
#if 0
/* Can't do this in FreeBSD since vtophys() returns the pfn */
/* of the remote domain who loaned us the machine page - DPT */
xen_phys_machine[(vtophys(vaddr(pending_req, i)) >> PAGE_SHIFT)] =
map[i].dev_bus_addr >> PAGE_SHIFT;
#endif
seg[i].buf = map[i].dev_bus_addr |
(req->seg[i].first_sect << 9);
}
if (req->sector_number + nr_sects > blkif->media_num_sectors) {
DPRINTF("%s of [%llu,%llu] extends past end of device %s\n",
operation == BIO_READ ? "read" : "write",
req->sector_number,
req->sector_number + nr_sects, blkif->dev_name);
goto fail_flush;
}
for (i = 0; i < nseg; i++) {
struct bio *bio;
if ((int)seg[i].nsec & ((blkif->sector_size >> 9) - 1)) {
DPRINTF("Misaligned I/O request from domain %d\n", blkif->domid);
goto fail_put_bio;
}
bio = biolist[nbio++] = g_new_bio();
if (unlikely(bio == NULL)) {
nbio--; /* the slot holds NULL; nothing to destroy */
goto fail_put_bio;
}
bio->bio_cmd = operation;
bio->bio_offset = req->sector_number << blkif->sector_size_shift;
bio->bio_length = seg[i].nsec << 9;
bio->bio_bcount = bio->bio_length;
bio->bio_data = (caddr_t)(vaddr(pending_req, i) | (seg[i].buf & PAGE_MASK));
bio->bio_done = end_block_io_op;
bio->bio_caller2 = pending_req;
bio->bio_dev = blkif->cdev;
req->sector_number += (seg[i].nsec << 9) >> blkif->sector_size_shift;
#if 0
printf("new: bio=%x cmd=%d sect=%llu nsect=%u iosize_max=%u @ %08lx\n",
(unsigned int)bio, req->operation, req->sector_number, seg[i].nsec,
blkif->cdev->si_iosize_max, seg[i].buf);
#endif
}
pending_req->pendcnt = nbio;
blkif_get(blkif);
for (i = 0; i < nbio; i++)
(*blkif->csw->d_strategy)(biolist[i]);
return;
fail_put_bio:
for (i = 0; i < nbio; i++)
g_destroy_bio(biolist[i]);
fail_flush:
fast_flush_area(pending_req);
fail_response:
make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
free_req(pending_req);
}
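/*
 * Task handler: drain the schedule list, consuming ring requests from each
 * blkif until its ring is empty or we run out of pending_req structs (in
 * which case the blkif is requeued and we retry when a request is freed).
 */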
static void
blk_req_action(void *context, int pending)
{
blkif_t *blkif;
DPRINTF("\n");
while (!STAILQ_EMPTY(&req_sched_list)) {
blkif_back_ring_t *blk_ring;
RING_IDX rc, rp;
blkif = remove_from_req_schedule_list();
blk_ring = &blkif->ring;
rc = blk_ring->req_cons;
rp = blk_ring->sring->req_prod;
rmb(); /* Ensure we see queued requests up to 'rp'. */
while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
blkif_request_t *req;
pending_req_t *pending_req;
pending_req = alloc_req();
if (pending_req == NULL)
goto out_of_preqs;
req = RING_GET_REQUEST(blk_ring, rc);
blk_ring->req_cons = ++rc; /* before make_response() */
switch (req->operation) {
case BLKIF_OP_READ:
blkif->st_rd_req++;
dispatch_rw_block_io(blkif, req, pending_req);
break;
case BLKIF_OP_WRITE:
blkif->st_wr_req++;
dispatch_rw_block_io(blkif, req, pending_req);
break;
default:
blkif->st_err_req++;
DPRINTF("error: unknown block io operation [%d]\n",
req->operation);
make_response(blkif, req->id, req->operation,
BLKIF_RSP_ERROR);
free_req(pending_req);
break;
}
}
blkif_put(blkif);
}
return;
out_of_preqs:
/* We ran out of pending req structs */
/* Just requeue interface and wait to be rescheduled to run when one is freed */
add_to_req_schedule_list_tail2(blkif);
blkif->st_oo_req++;
}
/* Handle interrupt from a frontend */
static void
blkback_intr(void *arg)
{
blkif_t *blkif = arg;
DPRINTF("%x\n", (unsigned int)blkif);
add_to_req_schedule_list_tail(blkif);
}
/* Map grant ref for ring */
static int
map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring)
{
struct gnttab_map_grant_ref op;
ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
if (ring->va == 0)
return ENOMEM;
op.host_addr = ring->va;
op.flags = GNTMAP_host_map;
op.ref = ref;
op.dom = dom;
HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
if (op.status) {
WPRINTF("grant table op err=%d\n", op.status);
kmem_free(kernel_map, ring->va, PAGE_SIZE);
ring->va = 0;
return EACCES;
}
ring->handle = op.handle;
ring->bus_addr = op.dev_bus_addr;
return 0;
}
/* Unmap grant ref for ring */
static void
unmap_ring(struct ring_ref *ring)
{
struct gnttab_unmap_grant_ref op;
op.host_addr = ring->va;
op.dev_bus_addr = ring->bus_addr;
op.handle = ring->handle;
HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
if (op.status)
WPRINTF("grant table op err=%d\n", op.status);
kmem_free(kernel_map, ring->va, PAGE_SIZE);
ring->va = 0;
}
static int
connect_ring(blkif_t *blkif)
{
struct xenbus_device *xdev = blkif->xdev;
blkif_sring_t *ring;
unsigned long ring_ref;
evtchn_port_t evtchn;
evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain };
int err;
if (blkif->ring_connected)
return 0;
/* Grab frontend data and map its memory. */
err = xenbus_gather(NULL, xdev->otherend,
"ring-ref", "%lu", &ring_ref,
"event-channel", "%u", &evtchn, NULL);
if (err) {
xenbus_dev_fatal(xdev, err,
"reading %s/ring-ref and event-channel",
xdev->otherend);
return err;
}
err = map_ring(ring_ref, blkif->domid, &blkif->rr);
if (err) {
xenbus_dev_fatal(xdev, err, "mapping ring");
return err;
}
ring = (blkif_sring_t *)blkif->rr.va;
BACK_RING_INIT(&blkif->ring, ring, PAGE_SIZE);
op.u.bind_interdomain.remote_dom = blkif->domid;
op.u.bind_interdomain.remote_port = evtchn;
err = HYPERVISOR_event_channel_op(&op);
if (err) {
unmap_ring(&blkif->rr);
xenbus_dev_fatal(xdev, err, "binding event channel");
return err;
}
blkif->evtchn = op.u.bind_interdomain.local_port;
/* bind evtchn to irq handler */
blkif->irq =
bind_evtchn_to_irqhandler(blkif->evtchn, "blkback",
blkback_intr, blkif, INTR_TYPE_NET|INTR_MPSAFE, &blkif->irq_cookie);
blkif->ring_connected = 1;
DPRINTF("%x rings connected! evtchn=%d irq=%d\n",
(unsigned int)blkif, blkif->evtchn, blkif->irq);
return 0;
}
static void
disconnect_ring(blkif_t *blkif)
{
DPRINTF("\n");
if (blkif->ring_connected) {
unbind_from_irqhandler(blkif->irq, blkif->irq_cookie);
blkif->irq = 0;
unmap_ring(&blkif->rr);
blkif->ring_connected = 0;
}
}
static void
connect(blkif_t *blkif)
{
struct xenbus_transaction *xbt;
struct xenbus_device *xdev = blkif->xdev;
int err;
if (!blkif->ring_connected ||
blkif->vn == NULL ||
blkif->state == XenbusStateConnected)
return;
DPRINTF("%s\n", xdev->otherend);
/* Supply the information about the device the frontend needs */
again:
xbt = xenbus_transaction_start();
if (IS_ERR(xbt)) {
xenbus_dev_fatal(xdev, PTR_ERR(xbt),
"Error writing configuration for backend "
"(start transaction)");
return;
}
err = xenbus_printf(xbt, xdev->nodename, "sectors", "%u",
blkif->media_num_sectors);
if (err) {
xenbus_dev_fatal(xdev, err, "writing %s/sectors",
xdev->nodename);
goto abort;
}
err = xenbus_printf(xbt, xdev->nodename, "info", "%u",
blkif->read_only ? VDISK_READONLY : 0);
if (err) {
xenbus_dev_fatal(xdev, err, "writing %s/info",
xdev->nodename);
goto abort;
}
err = xenbus_printf(xbt, xdev->nodename, "sector-size", "%u",
blkif->sector_size);
if (err) {
xenbus_dev_fatal(xdev, err, "writing %s/sector-size",
xdev->nodename);
goto abort;
}
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN)
goto again;
if (err)
xenbus_dev_fatal(xdev, err, "ending transaction");
err = xenbus_switch_state(xdev, NULL, XenbusStateConnected);
if (err)
xenbus_dev_fatal(xdev, err, "switching to Connected state",
xdev->nodename);
blkif->state = XenbusStateConnected;
return;
abort:
xenbus_transaction_end(xbt, 1);
}
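/*
 * Probe: the trailing component of the frontend path names the device
 * handle (the frontend node is typically .../device/vbd/<handle>); the
 * mode, type and params strings are read from our own xenstore node and
 * handed off to blkif_create().
 */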
static int
blkback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id)
{
int err;
char *p, *mode = NULL, *type = NULL, *params = NULL;
long handle;
DPRINTF("node=%s\n", xdev->nodename);
p = strrchr(xdev->otherend, '/') + 1;
handle = strtoul(p, NULL, 0);
mode = xenbus_read(NULL, xdev->nodename, "mode", NULL);
if (IS_ERR(mode)) {
err = PTR_ERR(mode);
xenbus_dev_fatal(xdev, err, "reading mode");
mode = NULL;
goto error;
}
type = xenbus_read(NULL, xdev->nodename, "type", NULL);
if (IS_ERR(type)) {
err = PTR_ERR(type);
xenbus_dev_fatal(xdev, err, "reading type");
type = NULL;
goto error;
}
params = xenbus_read(NULL, xdev->nodename, "params", NULL);
if (IS_ERR(params)) {
err = PTR_ERR(params);
xenbus_dev_fatal(xdev, err, "reading params");
params = NULL;
goto error;
}
err = blkif_create(xdev, handle, mode, type, params);
if (err) {
xenbus_dev_fatal(xdev, err, "creating blkif");
goto error;
}
err = vbd_add_dev(xdev);
if (err) {
blkif_put((blkif_t *)xdev->data);
xenbus_dev_fatal(xdev, err, "adding vbd device");
}
return err;
error:
if (mode)
free(mode, M_DEVBUF);
if (type)
free(type, M_DEVBUF);
if (params)
free(params, M_DEVBUF);
return err;
}
static int
blkback_remove(struct xenbus_device *xdev)
{
blkif_t *blkif = xdev->data;
device_t ndev;
DPRINTF("node=%s\n", xdev->nodename);
blkif->state = XenbusStateClosing;
if ((ndev = blkif->ndev)) {
blkif->ndev = NULL;
mtx_lock(&Giant);
device_detach(ndev);
mtx_unlock(&Giant);
}
xdev->data = NULL;
blkif->xdev = NULL;
blkif_put(blkif);
return 0;
}
static int
blkback_resume(struct xenbus_device *xdev)
{
DPRINTF("node=%s\n", xdev->nodename);
return 0;
}
static void
frontend_changed(struct xenbus_device *xdev,
XenbusState frontend_state)
{
blkif_t *blkif = xdev->data;
DPRINTF("state=%d\n", frontend_state);
blkif->frontend_state = frontend_state;
switch (frontend_state) {
case XenbusStateInitialising:
break;
case XenbusStateInitialised:
case XenbusStateConnected:
connect_ring(blkif);
connect(blkif);
break;
case XenbusStateClosing:
xenbus_switch_state(xdev, NULL, XenbusStateClosing);
break;
case XenbusStateClosed:
xenbus_remove_device(xdev);
break;
case XenbusStateUnknown:
case XenbusStateInitWait:
xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend",
frontend_state);
break;
}
}
/* ** Driver registration ** */
static struct xenbus_device_id blkback_ids[] = {
{ "vbd" },
{ "" }
};
static struct xenbus_driver blkback = {
.name = "blkback",
.ids = blkback_ids,
.probe = blkback_probe,
.remove = blkback_remove,
.resume = blkback_resume,
.otherend_changed = frontend_changed,
};
static void
blkback_init(void *unused)
{
int i;
TASK_INIT(&blk_req_task, 0, blk_req_action, NULL);
mtx_init(&req_sched_list_lock, "blk_req_sched_lock", "blkback req sched lock", MTX_DEF);
mtx_init(&pending_free_lock, "blk_pending_req_lock", "blkback pending request lock", MTX_DEF);
mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
pending_reqs = malloc(sizeof(pending_reqs[0]) *
blkif_reqs, M_DEVBUF, M_ZERO|M_NOWAIT);
pending_grant_handles = malloc(sizeof(pending_grant_handles[0]) *
mmap_pages, M_DEVBUF, M_NOWAIT);
pending_vaddrs = malloc(sizeof(pending_vaddrs[0]) *
mmap_pages, M_DEVBUF, M_NOWAIT);
mmap_vstart = alloc_empty_page_range(mmap_pages);
if (!pending_reqs || !pending_grant_handles || !pending_vaddrs || !mmap_vstart) {
if (pending_reqs)
free(pending_reqs, M_DEVBUF);
if (pending_grant_handles)
free(pending_grant_handles, M_DEVBUF);
if (pending_vaddrs)
free(pending_vaddrs, M_DEVBUF);
WPRINTF("out of memory\n");
return;
}
for (i = 0; i < mmap_pages; i++) {
pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
}
for (i = 0; i < blkif_reqs; i++) {
STAILQ_INSERT_TAIL(&pending_free, &pending_reqs[i], free_list);
}
DPRINTF("registering %s\n", blkback.name);
xenbus_register_backend(&blkback);
}
SYSINIT(xbbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, blkback_init, NULL)
static void
close_device(blkif_t *blkif)
{
DPRINTF("closing dev=%s\n", blkif->dev_name);
if (blkif->vn) {
int flags = FREAD;
if (!blkif->read_only)
flags |= FWRITE;
if (blkif->csw) {
dev_relthread(blkif->cdev);
blkif->csw = NULL;
}
(void)vn_close(blkif->vn, flags, NOCRED, curthread);
blkif->vn = NULL;
}
}
static int
open_device(blkif_t *blkif)
{
struct nameidata nd;
struct vattr vattr;
struct cdev *dev;
struct cdevsw *devsw;
int flags = FREAD, err = 0;
DPRINTF("opening dev=%s\n", blkif->dev_name);
if (!blkif->read_only)
flags |= FWRITE;
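/*
 * namei() needs current/root/jail directories to resolve the path; the
 * kernel thread doing the open may not have them set, so fall back to
 * rootvnode.
 */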
if (!curthread->td_proc->p_fd->fd_cdir) {
curthread->td_proc->p_fd->fd_cdir = rootvnode;
VREF(rootvnode);
}
if (!curthread->td_proc->p_fd->fd_rdir) {
curthread->td_proc->p_fd->fd_rdir = rootvnode;
VREF(rootvnode);
}
if (!curthread->td_proc->p_fd->fd_jdir) {
curthread->td_proc->p_fd->fd_jdir = rootvnode;
VREF(rootvnode);
}
again:
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, blkif->dev_name, curthread);
err = vn_open(&nd, &flags, 0, -1);
if (err) {
if (blkif->dev_name[0] != '/') {
char *dev_path = "/dev/";
char *dev_name;
/* Try adding device path at beginning of name */
dev_name = malloc(strlen(blkif->dev_name) + strlen(dev_path) + 1, M_DEVBUF, M_NOWAIT);
if (dev_name) {
sprintf(dev_name, "%s%s", dev_path, blkif->dev_name);
free(blkif->dev_name, M_DEVBUF);
blkif->dev_name = dev_name;
goto again;
}
}
xenbus_dev_fatal(blkif->xdev, err, "error opening device %s", blkif->dev_name);
return err;
}
NDFREE(&nd, NDF_ONLY_PNBUF);
blkif->vn = nd.ni_vp;
/* We only support disks for now */
if (!vn_isdisk(blkif->vn, &err)) {
xenbus_dev_fatal(blkif->xdev, err, "device %s is not a disk", blkif->dev_name);
VOP_UNLOCK(blkif->vn, 0, curthread);
goto error;
}
blkif->cdev = blkif->vn->v_rdev;
blkif->csw = dev_refthread(blkif->cdev);
PANIC_IF(blkif->csw == NULL);
err = VOP_GETATTR(blkif->vn, &vattr, NOCRED);
if (err) {
xenbus_dev_fatal(blkif->xdev, err,
"error getting vnode attributes for device %s", blkif->dev_name);
VOP_UNLOCK(blkif->vn, 0, curthread);
goto error;
}
VOP_UNLOCK(blkif->vn, 0, curthread);
dev = blkif->vn->v_rdev;
devsw = dev->si_devsw;
if (!devsw->d_ioctl) {
err = ENODEV;
xenbus_dev_fatal(blkif->xdev, err,
"no d_ioctl for device %s!", blkif->dev_name);
goto error;
}
err = (*devsw->d_ioctl)(dev, DIOCGSECTORSIZE, (caddr_t)&blkif->sector_size, FREAD, curthread);
if (err) {
xenbus_dev_fatal(blkif->xdev, err,
"error calling ioctl DIOCGSECTORSIZE for device %s", blkif->dev_name);
goto error;
}
blkif->sector_size_shift = fls(blkif->sector_size) - 1;
err = (*devsw->d_ioctl)(dev, DIOCGMEDIASIZE, (caddr_t)&blkif->media_size, FREAD, curthread);
if (err) {
xenbus_dev_fatal(blkif->xdev, err,
"error calling ioctl DIOCGMEDIASIZE for device %s", blkif->dev_name);
goto error;
}
blkif->media_num_sectors = blkif->media_size >> blkif->sector_size_shift;
blkif->major = major(vattr.va_rdev);
blkif->minor = minor(vattr.va_rdev);
DPRINTF("opened dev=%s major=%d minor=%d sector_size=%u media_size=%lld\n",
blkif->dev_name, blkif->major, blkif->minor, blkif->sector_size, blkif->media_size);
return 0;
error:
close_device(blkif);
return err;
}
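/*
 * Attach a newbus child (vbd<handle>) under nexus0 to represent the vbd.
 * The newbus topology is protected by the newbus sxlock (see the commit
 * message above), so the add/attach sequence is wrapped in
 * newbus_xlock()/newbus_xunlock().
 */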
static int
vbd_add_dev(struct xenbus_device *xdev)
{
blkif_t *blkif = xdev->data;
device_t nexus, ndev;
devclass_t dc;
int err = 0;
newbus_xlock();
/* We will add a vbd device as a child of nexus0 (for now) */
if (!(dc = devclass_find("nexus")) ||
!(nexus = devclass_get_device(dc, 0))) {
WPRINTF("could not find nexus0!\n");
err = ENOENT;
goto done;
}
/* Create a newbus device representing the vbd */
ndev = BUS_ADD_CHILD(nexus, 0, "vbd", blkif->handle);
if (!ndev) {
WPRINTF("could not create newbus device vbd%d!\n", blkif->handle);
err = EFAULT;
goto done;
}
blkif_get(blkif);
device_set_ivars(ndev, blkif);
blkif->ndev = ndev;
device_probe_and_attach(ndev);
done:
newbus_xunlock();
return err;
}
enum {
VBD_SYSCTL_DOMID,
VBD_SYSCTL_ST_RD_REQ,
VBD_SYSCTL_ST_WR_REQ,
VBD_SYSCTL_ST_OO_REQ,
VBD_SYSCTL_ST_ERR_REQ,
VBD_SYSCTL_RING,
};
static char *
vbd_sysctl_ring_info(blkif_t *blkif, int cmd)
{
char *buf = malloc(256, M_DEVBUF, M_WAITOK);
if (buf) {
if (!blkif->ring_connected)
sprintf(buf, "ring not connected\n");
else {
blkif_back_ring_t *ring = &blkif->ring;
sprintf(buf, "nr_ents=%x req_cons=%x"
" req_prod=%x req_event=%x"
" rsp_prod=%x rsp_event=%x",
ring->nr_ents, ring->req_cons,
ring->sring->req_prod, ring->sring->req_event,
ring->sring->rsp_prod, ring->sring->rsp_event);
}
}
return buf;
}
static int
vbd_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
device_t dev = (device_t)arg1;
blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
const char *value;
char *buf = NULL;
int err;
switch (arg2) {
case VBD_SYSCTL_DOMID:
return sysctl_handle_int(oidp, NULL, blkif->domid, req);
case VBD_SYSCTL_ST_RD_REQ:
return sysctl_handle_int(oidp, NULL, blkif->st_rd_req, req);
case VBD_SYSCTL_ST_WR_REQ:
return sysctl_handle_int(oidp, NULL, blkif->st_wr_req, req);
case VBD_SYSCTL_ST_OO_REQ:
return sysctl_handle_int(oidp, NULL, blkif->st_oo_req, req);
case VBD_SYSCTL_ST_ERR_REQ:
return sysctl_handle_int(oidp, NULL, blkif->st_err_req, req);
case VBD_SYSCTL_RING:
value = buf = vbd_sysctl_ring_info(blkif, arg2);
break;
default:
return (EINVAL);
}
err = SYSCTL_OUT(req, value, strlen(value));
if (buf != NULL)
free(buf, M_DEVBUF);
return err;
}
/* Newbus vbd device driver probe */
static int
vbd_probe(device_t dev)
{
DPRINTF("vbd%d\n", device_get_unit(dev));
return 0;
}
/* Newbus vbd device driver attach */
static int
vbd_attach(device_t dev)
{
blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
DPRINTF("%s\n", blkif->dev_name);
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD,
dev, VBD_SYSCTL_DOMID, vbd_sysctl_handler, "I",
"domid of frontend");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
OID_AUTO, "rd_reqs", CTLTYPE_INT|CTLFLAG_RD,
dev, VBD_SYSCTL_ST_RD_REQ, vbd_sysctl_handler, "I",
"number of read reqs");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
OID_AUTO, "wr_reqs", CTLTYPE_INT|CTLFLAG_RD,
dev, VBD_SYSCTL_ST_WR_REQ, vbd_sysctl_handler, "I",
"number of write reqs");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
OID_AUTO, "oo_reqs", CTLTYPE_INT|CTLFLAG_RD,
dev, VBD_SYSCTL_ST_OO_REQ, vbd_sysctl_handler, "I",
"number of deferred reqs");
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
OID_AUTO, "err_reqs", CTLTYPE_INT|CTLFLAG_RD,
dev, VBD_SYSCTL_ST_ERR_REQ, vbd_sysctl_handler, "I",
"number of reqs that returned error");
#if XEN_BLKBACK_DEBUG
SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
OID_AUTO, "ring", CTLFLAG_RD,
dev, VBD_SYSCTL_RING, vbd_sysctl_handler, "A",
"req ring info");
#endif
if (!open_device(blkif))
connect(blkif);
return bus_generic_attach(dev);
}
/* Newbus vbd device driver detach */
static int
vbd_detach(device_t dev)
{
blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
DPRINTF("%s\n", blkif->dev_name);
close_device(blkif);
bus_generic_detach(dev);
blkif_put(blkif);
return 0;
}
static device_method_t vbd_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, vbd_probe),
DEVMETHOD(device_attach, vbd_attach),
DEVMETHOD(device_detach, vbd_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, bus_generic_suspend),
DEVMETHOD(device_resume, bus_generic_resume),
{0, 0}
};
static devclass_t vbd_devclass;
static driver_t vbd_driver = {
"vbd",
vbd_methods,
0,
};
DRIVER_MODULE(vbd, nexus, vbd_driver, vbd_devclass, 0, 0);
/*
* Local variables:
* mode: C
* c-set-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: t
* End:
*/