freebsd-nq/sys/xen/xenstore/xenstore.c
Justin T. Gibbs 2ca7463bc7 Properly handle suspend/resume events in the Xen device
framework.

Sponsored by:	BQ Internet

sys/xen/xenbus/xenbusb.c:
	o In xenbusb_resume(), publish the state transition of the
	  resuming device into XenbusStateInitialising so that the
	  remote peer can see it.  Recording the state locally is
	  not sufficient to trigger a re-connect sequence.
	o In xenbusb_resume(), defer new-bus resume processing until
	  after the remote peer's XenStore address has been updated.
	  The drivers may need to refer to this information during
	  resume processing.

sys/xen/xenbus/xenbusb_back.c:
sys/xen/xenbus/xenbusb_front.c:
	Register xenbusb_resume() rather than bus_generic_resume()
	as the handler for device_resume events.

sys/xen/xenstore/xenstore.c:
	o Fix grammar in a comment.
	o In xs_suspend(), pass suspend events on to the child
	  devices (e.g. xenbusb_front/back) that are attached
	  to the XenStore.

Approved by:	re
MFC after:	1 week
2011-09-20 23:44:34 +00:00

1663 lines
42 KiB
C

/******************************************************************************
* xenstore.c
*
* Low-level kernel interface to the XenStore.
*
* Copyright (C) 2005 Rusty Russell, IBM Corporation
* Copyright (C) 2009,2010 Spectra Logic Corporation
*
* This file may be distributed separately from the Linux kernel, or
* incorporated into other software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/syslog.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/unistd.h>
#include <machine/xen/xen-os.h>
#include <machine/stdarg.h>
#include <xen/evtchn.h>
#include <xen/gnttab.h>
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/interface/hvm/params.h>
#include <xen/xenstore/xenstorevar.h>
#include <xen/xenstore/xenstore_internal.h>
#include <vm/vm.h>
#include <vm/pmap.h>
/**
* \file xenstore.c
* \brief XenStore interface
*
* The XenStore interface is a simple storage system that is a means of
* communicating state and configuration data between the Xen Domain 0
* and the various guest domains. All configuration data other than
* a small amount of essential information required during the early
* boot process of launching a Xen aware guest, is managed using the
* XenStore.
*
* The XenStore is ASCII string based, and has a structure and semantics
* similar to a filesystem. There are files and directories, the directories
* able to contain files or other directories. The depth of the hierarchy
* is only limited by the XenStore's maximum path length.
*
* The communication channel between the XenStore service and other
* domains is via two, guest specific, ring buffers in a shared memory
* area. One ring buffer is used for communicating in each direction.
* The grant table references for this shared memory are given to the
* guest either via the xen_start_info structure for a fully para-
* virtualized guest, or via HVM hypercalls for a hardware virtualized
* guest.
*
* The XenStore communication relies on an event channel and thus
* interrupts. For this reason, the attachment of the XenStore
* relies on an interrupt driven configuration hook to hold off
* boot processing until communication with the XenStore service
* can be established.
*
* Several Xen services depend on the XenStore, most notably the
* XenBus used to discover and manage Xen devices. These services
* are implemented as NewBus child attachments to a bus exported
* by this XenStore driver.
*/
/*
 * Forward declaration: find_watch() is defined further down in this file
 * but is called earlier, from xs_process_msg().
 */
static struct xs_watch *find_watch(const char *token);
/*
 * Malloc type used for every message, reply body and watch vector
 * allocated or freed in this file.
 */
MALLOC_DEFINE(M_XENSTORE, "xenstore", "XenStore data and results");
/**
 * Pointer to shared memory communication structures allowing us
 * to communicate with the XenStore service.
 *
 * When operating in full PV mode, this pointer is set early in kernel
 * startup from within xen_machdep.c. In HVM mode (XENHVM), xs_attach()
 * fetches the guest frame number of the shared page with
 * hvm_get_parameter() and maps it into kva with pmap_mapdev().
 */
struct xenstore_domain_interface *xen_store;
/*-------------------------- Private Data Structures ------------------------*/
/**
 * Structure capturing messages received from the XenStore service.
 *
 * xs_process_msg() allocates one of these per incoming message and
 * queues it either on xs.reply_list (replies) or on xs.watch_events
 * (watch events); the union member in use follows from hdr.type.
 */
struct xs_stored_msg {
/** Linkage for whichever queue the message is currently on. */
TAILQ_ENTRY(xs_stored_msg) list;
/** Wire header read from the response ring (type, length, ids). */
struct xsd_sockmsg hdr;
union {
/* Queued replies. */
struct {
/** NUL terminated reply payload, allocated from M_XENSTORE. */
char *body;
} reply;
/* Queued watch events. */
struct {
/** Registered watch matching the event's token. */
struct xs_watch *handle;
/**
 * String vector produced by split(); a single M_XENSTORE
 * allocation holding both the pointers and the string data.
 */
const char **vec;
/** Number of strings referenced by vec. */
u_int vec_size;
} watch;
} u;
};
/** Queue head type for lists of stored messages. */
TAILQ_HEAD(xs_stored_msg_list, xs_stored_msg);
/**
 * Container for all XenStore related state.
 *
 * A single static instance (xs) exists; xs_attach() installs it as the
 * device softc.
 */
struct xs_softc {
/** Newbus device for the XenStore. */
device_t xs_dev;
/**
 * Lock serializing access to ring producer/consumer
 * indexes. Use of this lock guarantees that wakeups
 * of blocking readers/writers are not missed due to
 * races with the XenStore service.
 */
struct mtx ring_lock;
/**
 * Mutex used to ensure exclusive access to the outgoing
 * communication ring. We use a lock type that can be
 * held while sleeping so that xs_write_store() can block waiting
 * for space in the ring to free up, without allowing another
 * writer to come in and corrupt a partial message write.
 */
struct sx request_mutex;
/**
 * A list of replies to our requests.
 *
 * The reply list is filled by xs_rcv_thread(). It
 * is consumed by the context that issued the request
 * to which a reply is made. The requester blocks in
 * xs_read_reply().
 *
 * \note Only one requesting context can be active at a time.
 *       This is guaranteed by the request_mutex and ensures
 *       that the requester sees replies matching the order
 *       of its requests.
 */
struct xs_stored_msg_list reply_list;
/** Lock protecting the reply list. */
struct mtx reply_lock;
/**
 * List of registered watches.
 */
struct xs_watch_list registered_watches;
/** Lock protecting the registered watches list. */
struct mtx registered_watches_lock;
/**
 * List of pending watch callback events.
 */
struct xs_stored_msg_list watch_events;
/** Lock protecting the watch callback list. */
struct mtx watch_events_lock;
/**
 * Sleepable lock used to prevent VM suspension while a
 * xenstore transaction is outstanding.
 *
 * Each active transaction holds a shared lock on the
 * suspend mutex. Our suspend method blocks waiting
 * to acquire an exclusive lock. This guarantees that
 * suspend processing will only proceed once all active
 * transactions have been retired.
 */
struct sx suspend_mutex;
/**
 * The process id of the xenwatch thread, recorded by xs_attach().
 */
pid_t xenwatch_pid;
/**
 * Sleepable mutex used to gate the execution of XenStore
 * watch event callbacks.
 *
 * xenwatch_thread holds an exclusive lock on this mutex
 * while delivering event callbacks. The original author notes
 * that the watch unregistration path (not visible in this
 * section of the file) takes the same lock exclusively to
 * guarantee that no callbacks of the just unregistered watch
 * are pending before returning to its caller.
 */
struct sx xenwatch_mutex;
#ifdef XENHVM
/**
 * The HVM guest pseudo-physical frame number. This is Xen's mapping
 * of the true machine frame number into our "physical address space".
 */
unsigned long gpfn;
#endif
/**
 * The event channel for communicating with the
 * XenStore service.
 */
int evtchn;
/** Interrupt number for our event channel; 0 means not yet bound. */
u_int irq;
/**
 * Interrupt driven config hook allowing us to defer
 * attaching children until interrupts (and thus communication
 * with the XenStore service) are available.
 */
struct intr_config_hook xs_attachcb;
};
/*-------------------------------- Global Data ------------------------------*/
/** The one and only XenStore softc; referenced directly throughout this file. */
static struct xs_softc xs;
/*------------------------- Private Utility Functions -----------------------*/
/**
 * Walk a buffer of back-to-back NUL terminated strings, counting them
 * and, when requested, recording where each one starts.
 *
 * \param strings  Start of the contiguous buffer of NUL terminated strings.
 * \param dest     Optional array receiving a pointer to the start of each
 *                 string; pass NULL to only count.
 * \param len      Size in bytes of the buffer addressed by strings.
 *
 * \return  The number of strings that begin inside the first len bytes.
 *
 * \note The final string must be NUL terminated within the buffer;
 *       split() guarantees this before calling.
 */
static u_int
extract_strings(const char *strings, const char **dest, u_int len)
{
	const char *end = strings + len;
	const char *cur = strings;
	u_int count = 0;

	while (cur < end) {
		if (dest != NULL)
			dest[count] = cur;
		count++;
		/* Step over this string and its terminating NUL. */
		cur += strlen(cur) + 1;
	}
	return (count);
}
/**
 * Convert a contiguous buffer containing a series of NUL terminated
 * strings into an array of pointers to strings.
 *
 * The returned pointer references the array of string pointers which
 * is followed, in the same allocation, by a copy of the string data.
 * It is the caller's responsibility to release it with
 * free(ret, M_XENSTORE).
 *
 * The storage addressed by strings is freed before split() returns.
 *
 * \param strings  A pointer to a contiguous buffer of NUL terminated
 *                 strings, allocated from M_XENSTORE.
 * \param len      The length of the buffer pointed to by strings.
 *                 May be zero, in which case no strings are found.
 * \param num      Out: the number of strings found and returned.
 *
 * \return  An array of *num pointers to the strings found in the
 *          input buffer.
 */
static const char **
split(char *strings, u_int len, u_int *num)
{
	const char **ret;

	/*
	 * Protect against unterminated buffers.  The guard matters:
	 * len is unsigned, so for an empty buffer "len - 1" would wrap
	 * and the store would land far outside the allocation.
	 */
	if (len > 0)
		strings[len - 1] = '\0';

	/* Count the strings. */
	*num = extract_strings(strings, /*dest*/NULL, len);

	/* Transfer to one big alloc for easy freeing by the caller. */
	ret = malloc(*num * sizeof(char *) + len, M_XENSTORE, M_WAITOK);
	memcpy(&ret[*num], strings, len);
	free(strings, M_XENSTORE);

	/* Extract pointers into the newly allocated copy of the data. */
	strings = (char *)&ret[*num];
	(void)extract_strings(strings, /*dest*/ret, len);

	return (ret);
}
/*------------------------- Public Utility Functions -------------------------*/
/*------- API comments for these methods can be found in xenstorevar.h -------*/
/**
 * Build the path "dir/name" in a freshly created auto-extending sbuf.
 * When name is the empty string the result is just dir, with no
 * trailing separator.  The sbuf is finished before it is returned.
 */
struct sbuf *
xs_join(const char *dir, const char *name)
{
	struct sbuf *path;

	path = sbuf_new_auto();
	sbuf_cat(path, dir);
	if (*name != '\0') {
		sbuf_putc(path, '/');
		sbuf_cat(path, name);
	}
	sbuf_finish(path);

	return (path);
}
/*-------------------- Low Level Communication Management --------------------*/
/**
 * Event channel interrupt handler for the XenStore.
 *
 * Readers and writers of the rings sleep on the "xen_store" address
 * while waiting for data or space.  Any event from the XenStore
 * service may have changed either ring, so wake all of them.
 */
static void
xs_intr(void *arg __unused)
{
	/*
	 * Sleepers test the ring indexes with ring_lock held, so
	 * issuing the wakeup under the same lock closes the window
	 * in which a wakeup could otherwise be lost.
	 */
	mtx_lock(&xs.ring_lock);
	wakeup(xen_store);
	mtx_unlock(&xs.ring_lock);
}
/**
 * Sanity check a ring's producer/consumer index pair.
 *
 * The producer can never be more than one full ring ahead of the
 * consumer.
 *
 * \param cons  The consumer index for the ring to test.
 * \param prod  The producer index for the ring to test.
 *
 * \retval 1  The indexes are in range.
 * \retval 0  The indexes are out of range.
 */
static int
xs_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
{
	if ((prod - cons) > XENSTORE_RING_SIZE)
		return (0);
	return (1);
}
/**
 * Locate the contiguous free region of a ring that can be written
 * next.
 *
 * The region starts at the (masked) producer index and is bounded by
 * whichever comes first: the physical end of the ring storage or the
 * total amount of unused space in the ring.
 *
 * \param cons  The consumer index for the ring.
 * \param prod  The producer index for the ring.
 * \param buf   The base address of the ring's storage.
 * \param len   Out: the amount of contiguous storage available.
 *
 * \return  A pointer to the start of the free region.
 */
static void *
xs_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
    char *buf, uint32_t *len)
{
	const uint32_t until_wrap =
	    XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
	const uint32_t unused = XENSTORE_RING_SIZE - (prod - cons);

	*len = (unused < until_wrap) ? unused : until_wrap;
	return (buf + MASK_XENSTORE_IDX(prod));
}
/**
 * Locate the contiguous region of a ring that can be read next.
 *
 * The region starts at the (masked) consumer index and is bounded by
 * whichever comes first: the physical end of the ring storage or the
 * amount of data the producer has made available.
 *
 * \param cons  The consumer index for the ring.
 * \param prod  The producer index for the ring.
 * \param buf   The base address of the ring's storage.
 * \param len   Out: the amount of contiguous data available to read.
 *
 * \return  A pointer to the start of the available data.
 */
static const void *
xs_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
    const char *buf, uint32_t *len)
{
	const uint32_t until_wrap =
	    XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
	const uint32_t pending = prod - cons;

	*len = (pending < until_wrap) ? pending : until_wrap;
	return (buf + MASK_XENSTORE_IDX(cons));
}
/**
 * Transmit data to the XenStore service.
 *
 * The data is copied into the request ring in as many contiguous
 * chunks as needed, sleeping whenever the ring is full. The remote
 * end is notified through the event channel after every chunk.
 *
 * \param tdata A pointer to the contiguous data to send.
 * \param len The amount of data to send.
 *
 * \return On success 0, otherwise an errno value indicating the
 * cause of failure: the error returned by an interrupted sleep,
 * or EIO when the ring indexes are found to be inconsistent.
 *
 * \invariant Called from thread context.
 * \invariant The buffer pointed to by tdata is at least len bytes
 * in length.
 * \invariant xs.request_mutex exclusively locked.
 */
static int
xs_write_store(const void *tdata, unsigned len)
{
XENSTORE_RING_IDX cons, prod;
const char *data = (const char *)tdata;
int error;
sx_assert(&xs.request_mutex, SX_XLOCKED);
while (len != 0) {
void *dst;
u_int avail;
/* Hold lock so we can't miss wakeups should we block. */
mtx_lock(&xs.ring_lock);
/* Snapshot both indexes; the checks below use this snapshot. */
cons = xen_store->req_cons;
prod = xen_store->req_prod;
if ((prod - cons) == XENSTORE_RING_SIZE) {
/*
 * Output ring is full. Wait for a ring event.
 *
 * Note that the events from both queues
 * are combined, so being woken does not
 * guarantee that space exists in the write
 * ring.
 *
 * To simplify error recovery and the retry,
 * we specify PDROP so our lock is *not* held
 * when msleep returns.
 */
error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
"xbwrite", /*timeout*/0);
/* EWOULDBLOCK is treated like a normal wakeup: just re-check. */
if (error && error != EWOULDBLOCK)
return (error);
/* Try again. */
continue;
}
mtx_unlock(&xs.ring_lock);
/* Verify queue sanity. */
if (!xs_check_indexes(cons, prod)) {
/* Indexes are corrupt: reset the request ring and give up. */
xen_store->req_cons = xen_store->req_prod = 0;
return (EIO);
}
dst = xs_get_output_chunk(cons, prod, xen_store->req, &avail);
/* Never copy more than the caller still has left to send. */
if (avail > len)
avail = len;
memcpy(dst, data, avail);
data += avail;
len -= avail;
/*
 * The store to the producer index, which indicates
 * to the other side that new data has arrived, must
 * be visible only after our copy of the data into the
 * ring has completed.
 */
wmb();
xen_store->req_prod += avail;
/*
 * notify_remote_via_evtchn implies mb(). The other side
 * will see the change to req_prod at the time of the
 * interrupt.
 */
notify_remote_via_evtchn(xs.evtchn);
}
return (0);
}
/**
 * Receive data from the XenStore service.
 *
 * Data is copied out of the response ring in as many contiguous
 * chunks as needed, sleeping whenever the ring is empty. The remote
 * end is notified through the event channel after every chunk so it
 * can reuse the space.
 *
 * \param tdata A pointer to the contiguous buffer to receive the data.
 * \param len The amount of data to receive.
 *
 * \return On success 0, otherwise an errno value indicating the
 * cause of failure: the error returned by an interrupted sleep,
 * or EIO when the ring indexes are found to be inconsistent.
 *
 * \invariant Called from thread context.
 * \invariant The buffer pointed to by tdata is at least len bytes
 * in length.
 *
 * \note xs_read_store does not perform any internal locking to
 * guarantee serial access to the incoming ring buffer. However,
 * there is only one context processing reads: xs_rcv_thread().
 */
static int
xs_read_store(void *tdata, unsigned len)
{
XENSTORE_RING_IDX cons, prod;
char *data = (char *)tdata;
int error;
while (len != 0) {
u_int avail;
const char *src;
/* Hold lock so we can't miss wakeups should we block. */
mtx_lock(&xs.ring_lock);
/* Snapshot both indexes; the checks below use this snapshot. */
cons = xen_store->rsp_cons;
prod = xen_store->rsp_prod;
if (cons == prod) {
/*
 * Nothing to read. Wait for a ring event.
 *
 * Note that the events from both queues
 * are combined, so being woken does not
 * guarantee that data exist in the read
 * ring.
 *
 * To simplify error recovery and the retry,
 * we specify PDROP so our lock is *not* held
 * when msleep returns.
 */
error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
"xbread", /*timeout*/0);
/* EWOULDBLOCK is treated like a normal wakeup: just re-check. */
if (error && error != EWOULDBLOCK)
return (error);
continue;
}
mtx_unlock(&xs.ring_lock);
/* Verify queue sanity. */
if (!xs_check_indexes(cons, prod)) {
/* Indexes are corrupt: reset the response ring and give up. */
xen_store->rsp_cons = xen_store->rsp_prod = 0;
return (EIO);
}
src = xs_get_input_chunk(cons, prod, xen_store->rsp, &avail);
/* Never copy more than the caller still wants. */
if (avail > len)
avail = len;
/*
 * Ensure the data we read is related to the indexes
 * we read above.
 */
rmb();
memcpy(data, src, avail);
data += avail;
len -= avail;
/*
 * Ensure that the producer of this ring does not see
 * the ring space as free until after we have copied it
 * out.
 */
mb();
xen_store->rsp_cons += avail;
/*
 * notify_remote_via_evtchn implies mb(). The producer
 * will see the updated consumer index when the event
 * is delivered.
 */
notify_remote_via_evtchn(xs.evtchn);
}
return (0);
}
/*----------------------- Received Message Processing ------------------------*/
/**
* Block reading the next message from the XenStore service and
* process the result.
*
* \param type The returned type of the XenStore message received.
*
* \return 0 on success. Otherwise an errno value indicating the
* type of failure encountered.
*/
static int
xs_process_msg(enum xsd_sockmsg_type *type)
{
struct xs_stored_msg *msg;
char *body;
int error;
msg = malloc(sizeof(*msg), M_XENSTORE, M_WAITOK);
error = xs_read_store(&msg->hdr, sizeof(msg->hdr));
if (error) {
free(msg, M_XENSTORE);
return (error);
}
body = malloc(msg->hdr.len + 1, M_XENSTORE, M_WAITOK);
error = xs_read_store(body, msg->hdr.len);
if (error) {
free(body, M_XENSTORE);
free(msg, M_XENSTORE);
return (error);
}
body[msg->hdr.len] = '\0';
*type = msg->hdr.type;
if (msg->hdr.type == XS_WATCH_EVENT) {
msg->u.watch.vec = split(body, msg->hdr.len,
&msg->u.watch.vec_size);
mtx_lock(&xs.registered_watches_lock);
msg->u.watch.handle = find_watch(
msg->u.watch.vec[XS_WATCH_TOKEN]);
if (msg->u.watch.handle != NULL) {
mtx_lock(&xs.watch_events_lock);
TAILQ_INSERT_TAIL(&xs.watch_events, msg, list);
wakeup(&xs.watch_events);
mtx_unlock(&xs.watch_events_lock);
} else {
free(msg->u.watch.vec, M_XENSTORE);
free(msg, M_XENSTORE);
}
mtx_unlock(&xs.registered_watches_lock);
} else {
msg->u.reply.body = body;
mtx_lock(&xs.reply_lock);
TAILQ_INSERT_TAIL(&xs.reply_list, msg, list);
wakeup(&xs.reply_list);
mtx_unlock(&xs.reply_lock);
}
return (0);
}
/**
 * Thread body of the XenStore receive thread.
 *
 * Loops forever pulling messages from the XenStore service via
 * xs_process_msg(), reporting any read error on the console.
 */
static void
xs_rcv_thread(void *arg __unused)
{
	enum xsd_sockmsg_type msg_type;
	int rc;

	while (1) {
		rc = xs_process_msg(&msg_type);
		if (rc == 0)
			continue;
		printf("XENSTORE error %d while reading message\n", rc);
	}
}
/*---------------- XenStore Message Request/Reply Processing -----------------*/
/**
 * Hook run before a request is transmitted to the XenStore service.
 *
 * At present its only job is transaction accounting: starting a
 * transaction takes a shared hold on the suspend lock, which
 * xs_reply_filter() later drops.
 *
 * \param request_msg_type  The message type for the request.
 */
static inline void
xs_request_filter(uint32_t request_msg_type)
{
	if (request_msg_type != XS_TRANSACTION_START)
		return;

	sx_slock(&xs.suspend_mutex);
}
/**
 * Hook run after a request has been transmitted to the XenStore
 * service and its reply (if any) collected.
 *
 * At present its only job is transaction accounting: it releases the
 * shared hold on the suspend lock taken by xs_request_filter().
 *
 * \param request_msg_type     The message type for the original request.
 * \param reply_msg_type       The message type for any received reply.
 * \param request_reply_error  The error status from the attempt to send
 *                             the request or retrieve the reply.
 */
static inline void
xs_reply_filter(uint32_t request_msg_type,
    uint32_t reply_msg_type, int request_reply_error)
{
	int end_requested, end_acked, start_failed;

	/* We tried to end a transaction, whether or not that worked. */
	end_requested = (request_msg_type == XS_TRANSACTION_END);

	/* The service acknowledged the end of a transaction. */
	end_acked = (request_reply_error == 0 &&
	    reply_msg_type == XS_TRANSACTION_END);

	/* Our attempt to begin a transaction did not succeed. */
	start_failed = (request_msg_type == XS_TRANSACTION_START &&
	    (request_reply_error != 0 || reply_msg_type == XS_ERROR));

	/* In each of these cases the count of transactions drops. */
	if (end_requested || end_acked || start_failed)
		sx_sunlock(&xs.suspend_mutex);
}
#define xsd_error_count (sizeof(xsd_errors) / sizeof(xsd_errors[0]))
/**
 * Translate an error string returned by the XenStore service into an
 * errno value by looking it up in the xsd_errors table.
 *
 * \param errorstring  The error string to convert.
 *
 * \return  The errno paired with errorstring in xsd_errors, or EINVAL
 *          (after logging a warning) when the string is not known.
 */
static int
xs_get_error(const char *errorstring)
{
	u_int idx = 0;

	while (idx < xsd_error_count) {
		if (strcmp(errorstring, xsd_errors[idx].errstring) == 0)
			return (xsd_errors[idx].errnum);
		idx++;
	}

	log(LOG_WARNING, "XENSTORE xen store gave: unknown error %s",
	    errorstring);
	return (EINVAL);
}
/**
 * Block until a reply is available on xs.reply_list, then dequeue it
 * and hand its contents to the caller.
 *
 * \param type    Out: the type of the reply.
 * \param len     Out (optional, may be NULL): the body length of the
 *                reply.
 * \param result  Out: the body of the reply, in M_XENSTORE storage
 *                now owned by the caller.
 *
 * \return  0 on success. Otherwise the errno returned by an
 *          interrupted sleep.
 */
static int
xs_read_reply(enum xsd_sockmsg_type *type, u_int *len, void **result)
{
	struct xs_stored_msg *msg;
	int error;

	mtx_lock(&xs.reply_lock);
	for (;;) {
		msg = TAILQ_FIRST(&xs.reply_list);
		if (msg != NULL)
			break;

		/* Periodic timeouts are harmless: just look again. */
		error = mtx_sleep(&xs.reply_list, &xs.reply_lock,
		    PCATCH, "xswait", hz/10);
		if (error != 0 && error != EWOULDBLOCK) {
			mtx_unlock(&xs.reply_lock);
			return (error);
		}
	}
	TAILQ_REMOVE(&xs.reply_list, msg, list);
	mtx_unlock(&xs.reply_lock);

	*type = msg->hdr.type;
	if (len != NULL)
		*len = msg->hdr.len;
	*result = msg->u.reply.body;

	/* Only the container is released; the body goes to the caller. */
	free(msg, M_XENSTORE);
	return (0);
}
/**
* Pass-thru interface for XenStore access by userland processes
* via the XenStore device.
*
* Reply type and length data are returned by overwriting these
* fields in the passed in request message.
*
* \param msg A properly formatted message to transmit to
* the XenStore service.
* \param result The returned body of the reply.
*
* \return 0 on success. Otherwise an errno indicating the cause
* of failure.
*
* \note The returned result is provided in malloced storage and thus
* must be free'd by the caller with 'free(result, M_XENSTORE);
*/
int
xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result)
{
uint32_t request_type;
int error;
request_type = msg->type;
xs_request_filter(request_type);
sx_xlock(&xs.request_mutex);
if ((error = xs_write_store(msg, sizeof(*msg) + msg->len)) == 0)
error = xs_read_reply(&msg->type, &msg->len, result);
sx_xunlock(&xs.request_mutex);
xs_reply_filter(request_type, msg->type, error);
return (error);
}
/**
 * Send a message with an optionally multi-part body to the XenStore
 * service and wait for the reply.
 *
 * \param t             The transaction to use for this request.
 * \param request_type  The type of message to send.
 * \param iovec         Pointers to the body sections of the request.
 * \param num_vecs      The number of body sections in the request.
 * \param len           Out (optional): the length of the reply.
 * \param result        Out (optional): the body of the reply.  When
 *                      NULL the reply body is freed here.
 *
 * \return  0 on success. Otherwise an errno indicating the cause of
 *          failure; an XS_ERROR reply is translated with
 *          xs_get_error().
 *
 * \note The returned result is provided in malloced storage and thus
 *       must be freed by the caller with 'free(*result, M_XENSTORE)'.
 */
static int
xs_talkv(struct xs_transaction t, enum xsd_sockmsg_type request_type,
    const struct iovec *iovec, u_int num_vecs, u_int *len, void **result)
{
	struct xsd_sockmsg msg;
	void *ret = NULL;
	u_int i;
	int error;

	msg.tx_id = t.id;
	msg.req_id = 0;
	msg.type = request_type;
	msg.len = 0;
	for (i = 0; i < num_vecs; i++)
		msg.len += iovec[i].iov_len;

	xs_request_filter(request_type);

	sx_xlock(&xs.request_mutex);

	/* Header first, then each body section until one fails. */
	error = xs_write_store(&msg, sizeof(msg));
	for (i = 0; error == 0 && i < num_vecs; i++)
		error = xs_write_store(iovec[i].iov_base, iovec[i].iov_len);

	if (error != 0)
		printf("xs_talkv failed %d\n", error);
	else
		error = xs_read_reply(&msg.type, len, &ret);

	sx_xunlock(&xs.request_mutex);

	xs_reply_filter(request_type, msg.type, error);
	if (error != 0)
		return (error);

	if (msg.type == XS_ERROR) {
		error = xs_get_error(ret);
		free(ret, M_XENSTORE);
		return (error);
	}

	/* Reply is either error or an echo of our request message type. */
	KASSERT(msg.type == request_type, ("bad xenstore message type"));

	if (result != NULL)
		*result = ret;
	else
		free(ret, M_XENSTORE);

	return (0);
}
/**
 * Convenience wrapper around xs_talkv() for requests whose body is a
 * single NUL terminated string.  The terminating NUL is transmitted
 * as part of the body.
 *
 * \param t             The transaction to use for this request.
 * \param request_type  The type of message to send.
 * \param body          The body of the request.
 * \param len           Out: the length of the reply.
 * \param result        Out: the body of the reply.
 *
 * \return  0 on success. Otherwise an errno indicating
 *          the cause of failure.
 *
 * \note The returned result is provided in malloced storage and thus
 *       must be freed by the caller with 'free(*result, M_XENSTORE)'.
 */
static int
xs_single(struct xs_transaction t, enum xsd_sockmsg_type request_type,
    const char *body, u_int *len, void **result)
{
	struct iovec vec = {
		.iov_base = (void *)(uintptr_t)body,
		.iov_len = strlen(body) + 1,
	};

	return (xs_talkv(t, request_type, &vec, 1, len, result));
}
/*------------------------- XenStore Watch Support ---------------------------*/
/**
 * Transmit a watch request to the XenStore service.
 *
 * The request body is the path followed by the token, each sent with
 * its terminating NUL.
 *
 * \param path   The path in the XenStore to watch.
 * \param token  A unique identifier for this watch.
 *
 * \return  0 on success. Otherwise an errno indicating the
 *          cause of failure.
 */
static int
xs_watch(const char *path, const char *token)
{
	struct iovec parts[2] = {
		{
			.iov_base = (void *)(uintptr_t)path,
			.iov_len = strlen(path) + 1,
		},
		{
			.iov_base = (void *)(uintptr_t)token,
			.iov_len = strlen(token) + 1,
		},
	};

	return (xs_talkv(XST_NIL, XS_WATCH, parts, 2, NULL, NULL));
}
/**
 * Transmit an unwatch request to the XenStore service.
 *
 * The request body is the path followed by the token, each sent with
 * its terminating NUL.
 *
 * \param path   The path in the XenStore that is being watched.
 * \param token  The unique identifier of the watch to remove.
 *
 * \return  0 on success. Otherwise an errno indicating the
 *          cause of failure.
 */
static int
xs_unwatch(const char *path, const char *token)
{
	struct iovec parts[2] = {
		{
			.iov_base = (void *)(uintptr_t)path,
			.iov_len = strlen(path) + 1,
		},
		{
			.iov_base = (void *)(uintptr_t)token,
			.iov_len = strlen(token) + 1,
		},
	};

	return (xs_talkv(XST_NIL, XS_UNWATCH, parts, 2, NULL, NULL));
}
/**
* Convert from watch token (unique identifier) to the associated
* internal tracking structure for this watch.
*
* \param tocken The unique identifier for the watch to find.
*
* \return A pointer to the found watch structure or NULL.
*/
static struct xs_watch *
find_watch(const char *token)
{
struct xs_watch *i, *cmp;
cmp = (void *)strtoul(token, NULL, 16);
LIST_FOREACH(i, &xs.registered_watches, list)
if (i == cmp)
return (i);
return (NULL);
}
/**
 * Thread body of the XenStore watch event dispatch thread.
 *
 * Loops forever: waits for xs_process_msg() to queue an event on
 * xs.watch_events, removes one event, invokes the callback of the
 * watch it belongs to, and frees the event.
 */
static void
xenwatch_thread(void *unused)
{
struct xs_stored_msg *msg;
for (;;) {
/* Sleep (re-checking every hz/10 ticks) until an event is queued. */
mtx_lock(&xs.watch_events_lock);
while (TAILQ_EMPTY(&xs.watch_events))
mtx_sleep(&xs.watch_events,
&xs.watch_events_lock,
PWAIT | PCATCH, "waitev", hz/10);
mtx_unlock(&xs.watch_events_lock);
/*
 * The event lock is dropped before taking the sleepable
 * xenwatch_mutex and then re-taken, so the queue may have
 * changed in between; that is why msg is re-checked for
 * NULL below.
 */
sx_xlock(&xs.xenwatch_mutex);
mtx_lock(&xs.watch_events_lock);
msg = TAILQ_FIRST(&xs.watch_events);
if (msg)
TAILQ_REMOVE(&xs.watch_events, msg, list);
mtx_unlock(&xs.watch_events_lock);
if (msg != NULL) {
/*
 * XXX There are messages coming in with a NULL
 * XXX callback. This deserves further investigation;
 * XXX the workaround here simply prevents the kernel
 * XXX from panic'ing on startup.
 */
if (msg->u.watch.handle->callback != NULL)
msg->u.watch.handle->callback(
msg->u.watch.handle,
(const char **)msg->u.watch.vec,
msg->u.watch.vec_size);
/* vec holds both the pointer array and the string data. */
free(msg->u.watch.vec, M_XENSTORE);
free(msg, M_XENSTORE);
}
sx_xunlock(&xs.xenwatch_mutex);
}
}
/*----------- XenStore Configuration, Initialization, and Control ------------*/
/**
 * (Re)establish the communication channel with the XenStore service.
 *
 * Discards any stale data left in the response ring, releases a
 * previously bound interrupt handler if there is one, and binds
 * xs_intr() to the XenStore event channel.
 *
 * \return  On success, 0. Otherwise an errno value indicating the
 *          type of failure.
 */
static int
xs_init_comms(void)
{
	int error;

	if (xen_store->rsp_prod != xen_store->rsp_cons) {
		log(LOG_WARNING, "XENSTORE response ring is not quiescent "
		    "(%08x:%08x): fixing up\n",
		    xen_store->rsp_cons, xen_store->rsp_prod);
		xen_store->rsp_cons = xen_store->rsp_prod;
	}

	/* A non-zero irq means a handler is still bound from earlier. */
	if (xs.irq != 0)
		unbind_from_irqhandler(xs.irq);

	error = bind_caller_port_to_irqhandler(xs.evtchn, "xenstore",
	    xs_intr, NULL, INTR_TYPE_NET, &xs.irq);
	if (error != 0)
		log(LOG_WARNING, "XENSTORE request irq failed %i\n", error);

	return (error);
}
/*------------------ Private Device Attachment Functions --------------------*/
/**
 * Identify method: add a "xenstore" child device to the parent bus.
 *
 * \param driver  The driver performing the identify action (unused).
 * \param parent  The bus to which the child is added.
 */
static void
xs_identify(driver_t *driver, device_t parent)
{

	BUS_ADD_CHILD(parent, /*order*/0, "xenstore", /*unit*/0);
}
/**
 * Probe for the existence of the XenStore.
 *
 * \param dev  The device being probed.
 *
 * \return  Always 0 (success).
 */
static int
xs_probe(device_t dev)
{
	/*
	 * Either this is a PV kernel, or we are being probed as a
	 * child of a successfully attached xenpci device.  Both imply
	 * a Xen environment, and therefore a XenStore, so there is
	 * nothing to test for.
	 */
	device_set_desc(dev, "XenStore");

	return (0);
}
/**
 * Interrupt config hook callback installed by xs_attach().
 *
 * Initializes the XenStore device interface, probes and attaches the
 * children of the XenStore bus, and finally removes the config hook.
 *
 * \param arg  Hook argument; not used.
 */
static void
xs_attach_deferred(void *arg)
{

	xs_dev_init();

	bus_generic_probe(xs.xs_dev);
	bus_generic_attach(xs.xs_dev);

	config_intrhook_disestablish(&xs.xs_attachcb);
}
/**
 * Attach to the XenStore.
 *
 * Sets up the softc, grant tables, locks, event channel and the two
 * service threads, then registers a config hook so that children
 * depending on the XenStore are probed and attached once interrupts
 * are available.
 *
 * \param dev  The XenStore device being attached.
 *
 * \return  0 on success, otherwise an errno value.
 */
static int
xs_attach(device_t dev)
{
	struct proc *watch_proc;
	int error;

	/* Allow us to get device_t from softc and vice-versa. */
	xs.xs_dev = dev;
	device_set_softc(dev, &xs);

	/*
	 * Initializing the grant tables here looks like a layering
	 * violation: the XenStore is only one of many grant table
	 * clients.  It works because the XenStore is the first such
	 * client and gates every other device.  The PV support code
	 * (PV kernels) or the xenpci driver (HVM kernels) would be a
	 * better home.
	 */
	error = gnttab_init();
	if (error != 0) {
		log(LOG_WARNING,
		    "XENSTORE: Error initializing grant tables: %d\n", error);
		return (ENXIO);
	}

	/* Locate the event channel and, for HVM, map the shared page. */
#ifdef XENHVM
	xs.evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
	xs.gpfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
	xen_store = pmap_mapdev(xs.gpfn * PAGE_SIZE, PAGE_SIZE);
#else
	xs.evtchn = xen_start_info->store_evtchn;
#endif

	/* Message queues. */
	TAILQ_INIT(&xs.reply_list);
	TAILQ_INIT(&xs.watch_events);

	/* Non-sleepable locks. */
	mtx_init(&xs.ring_lock, "ring lock", NULL, MTX_DEF);
	mtx_init(&xs.reply_lock, "reply lock", NULL, MTX_DEF);
	mtx_init(&xs.registered_watches_lock, "watches", NULL, MTX_DEF);
	mtx_init(&xs.watch_events_lock, "watch events", NULL, MTX_DEF);

	/* Sleepable locks. */
	sx_init(&xs.xenwatch_mutex, "xenwatch");
	sx_init(&xs.request_mutex, "xenstore request");
	sx_init(&xs.suspend_mutex, "xenstore suspend");

	/* No interrupt bound yet; xs_init_comms() keys off this. */
	xs.irq = 0;

	/* Initialize the shared memory rings to talk to xenstored */
	error = xs_init_comms();
	if (error != 0)
		return (error);

	error = kproc_create(xenwatch_thread, NULL, &watch_proc, RFHIGHPID,
	    0, "xenwatch");
	if (error != 0)
		return (error);
	xs.xenwatch_pid = watch_proc->p_pid;

	error = kproc_create(xs_rcv_thread, NULL, NULL,
	    RFHIGHPID, 0, "xenstore_rcv");

	/* Defer child attachment until interrupts are running. */
	xs.xs_attachcb.ich_func = xs_attach_deferred;
	xs.xs_attachcb.ich_arg = NULL;
	config_intrhook_establish(&xs.xs_attachcb);

	return (error);
}
/**
 * Prepare for suspension of this VM.
 *
 * Child devices are suspended first.  If that succeeds, XenStore
 * access is halted by taking the suspend and request locks
 * exclusively, which waits for all outstanding transactions and
 * requests to finish.  Both locks stay held until xs_resume().
 *
 * \param dev  The XenStore device.
 *
 * \return  0 on success, otherwise the error reported while
 *          suspending the children (no locks are taken in that case).
 */
static int
xs_suspend(device_t dev)
{
	int error;

	/* Suspend child Xen devices. */
	error = bus_generic_suspend(dev);
	if (error == 0) {
		sx_xlock(&xs.suspend_mutex);
		sx_xlock(&xs.request_mutex);
	}

	return (error);
}
/**
 * Resume XenStore operations after this VM is resumed.
 *
 * Re-initializes the communication channel, releases the locks taken by
 * xs_suspend(), re-issues every registered watch and finally resumes
 * the child devices.  Failures along the way are logged but do not stop
 * the resume: bailing out early would leave the suspend-time locks held.
 *
 * \param dev  The XenStore device instance.
 *
 * \return  Always 0.
 */
static int
xs_resume(device_t dev)
{
	struct xs_watch *watch;
	/* Pointer rendered in hex, two characters per byte, plus NUL. */
	char token[sizeof(watch) * 2 + 1];
	int error;

	error = xs_init_comms();
	if (error != 0) {
		log(LOG_WARNING,
		    "XENSTORE: Error re-initializing comms on resume: %d\n",
		    error);
	}

	sx_xunlock(&xs.request_mutex);

	/*
	 * No need for registered_watches_lock: the suspend_mutex
	 * is sufficient.
	 */
	LIST_FOREACH(watch, &xs.registered_watches, list) {
		sprintf(token, "%lX", (long)watch);
		error = xs_watch(watch->node, token);
		if (error != 0) {
			log(LOG_WARNING,
			    "XENSTORE: Failed to re-register watch %s: %d\n",
			    watch->node, error);
		}
	}

	sx_xunlock(&xs.suspend_mutex);

	/* Resume child Xen devices. */
	bus_generic_resume(dev);

	return (0);
}
/*-------------------- Private Device Attachment Data -----------------------*/
/*
 * Method table for the xenstore driver.  Identify, probe, attach, suspend
 * and resume are implemented in this file; detach, shutdown and all bus
 * methods are delegated to the generic bus implementations.
 */
static device_method_t xenstore_methods[] = {
/* Device interface */
DEVMETHOD(device_identify, xs_identify),
DEVMETHOD(device_probe, xs_probe),
DEVMETHOD(device_attach, xs_attach),
DEVMETHOD(device_detach, bus_generic_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, xs_suspend),
DEVMETHOD(device_resume, xs_resume),
/* Bus interface */
DEVMETHOD(bus_add_child, bus_generic_add_child),
DEVMETHOD(bus_print_child, bus_generic_print_child),
DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource),
DEVMETHOD(bus_release_resource, bus_generic_release_resource),
DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
/* Table terminator. */
{ 0, 0 }
};
/*
 * Driver class built from xenstore_methods.  The last argument (the
 * per-instance size) is 0; xs_attach() installs the file-scope "xs"
 * object as the softc via device_set_softc().
 */
DEFINE_CLASS_0(xenstore, xenstore_driver, xenstore_methods, 0);
static devclass_t xenstore_devclass;
/*
 * Register the driver: under the "xenpci" bus when built with XENHVM,
 * under "nexus" otherwise.
 */
#ifdef XENHVM
DRIVER_MODULE(xenstore, xenpci, xenstore_driver, xenstore_devclass, 0, 0);
#else
DRIVER_MODULE(xenstore, nexus, xenstore_driver, xenstore_devclass, 0, 0);
#endif
/*------------------------------- Sysctl Data --------------------------------*/
/* XXX Shouldn't the node be somewhere else? */
/* Read-only "dev.xen" node holding the entries declared below. */
SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen");
/* dev.xen.xsd_port: read-only view of xs.evtchn. */
SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xs.evtchn, 0, "");
/* dev.xen.xsd_kva: read-only view of the xen_store variable, read as a u_long. */
SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, "");
/*-------------------------------- Public API --------------------------------*/
/*------- API comments for these methods can be found in xenstorevar.h -------*/
/*
 * Issue an XS_DIRECTORY request for dir/node.  On success the reply is
 * handed to split(), whose return value is stored in *result and which
 * also receives num.  Other code in this file releases that array with
 * free(..., M_XENSTORE).  On failure *result and *num are not written
 * here.
 */
int
xs_directory(struct xs_transaction t, const char *dir, const char *node,
    u_int *num, const char ***result)
{
	struct sbuf *full_path;
	char *reply;
	u_int reply_len;
	int err;

	reply_len = 0;

	/* The joined path is only needed for the duration of the request. */
	full_path = xs_join(dir, node);
	err = xs_single(t, XS_DIRECTORY, sbuf_data(full_path), &reply_len,
	    (void **)&reply);
	sbuf_delete(full_path);

	if (err == 0)
		*result = split(reply, reply_len, num);

	return (err);
}
int
xs_exists(struct xs_transaction t, const char *dir, const char *node)
{
const char **d;
int error, dir_n;
error = xs_directory(t, dir, node, &dir_n, &d);
if (error)
return (0);
free(d, M_XENSTORE);
return (1);
}
/*
 * Issue an XS_READ request for dir/node.  On success the reply buffer is
 * stored in *result; callers in this file release it with
 * free(..., M_XENSTORE).  len is passed straight through to xs_single().
 * On failure *result is left untouched.
 */
int
xs_read(struct xs_transaction t, const char *dir, const char *node,
    u_int *len, void **result)
{
	struct sbuf *full_path;
	void *value;
	int err;

	full_path = xs_join(dir, node);
	err = xs_single(t, XS_READ, sbuf_data(full_path), len, &value);
	sbuf_delete(full_path);

	/* Publish the buffer only once the request is known to be good. */
	if (err == 0)
		*result = value;

	return (err);
}
/*
 * Issue an XS_WRITE request that stores string at dir/node.  The request
 * consists of two segments: the path including its terminating NUL,
 * followed by the value without a terminating NUL.
 */
int
xs_write(struct xs_transaction t, const char *dir, const char *node,
    const char *string)
{
	struct iovec segs[2];
	struct sbuf *full_path;
	int status;

	full_path = xs_join(dir, node);

	/* Segment 0: path, NUL included (hence the + 1). */
	segs[0].iov_base = (void *)(uintptr_t) sbuf_data(full_path);
	segs[0].iov_len = sbuf_len(full_path) + 1;

	/* Segment 1: value, NUL excluded. */
	segs[1].iov_base = (void *)(uintptr_t) string;
	segs[1].iov_len = strlen(string);

	status = xs_talkv(t, XS_WRITE, segs, 2, NULL, NULL);
	sbuf_delete(full_path);

	return (status);
}
/*
 * Issue an XS_MKDIR request for dir/node and return the request's
 * status.  No reply payload is requested.
 */
int
xs_mkdir(struct xs_transaction t, const char *dir, const char *node)
{
	struct sbuf *full_path;
	int status;

	full_path = xs_join(dir, node);
	status = xs_single(t, XS_MKDIR, sbuf_data(full_path), NULL, NULL);
	sbuf_delete(full_path);

	return (status);
}
/*
 * Issue an XS_RM request for dir/node and return the request's status.
 * No reply payload is requested.
 */
int
xs_rm(struct xs_transaction t, const char *dir, const char *node)
{
	struct sbuf *full_path;
	int status;

	full_path = xs_join(dir, node);
	status = xs_single(t, XS_RM, sbuf_data(full_path), NULL, NULL);
	sbuf_delete(full_path);

	return (status);
}
/*
 * Remove base/node together with everything below it.
 *
 * The tree is walked iteratively: each entry of the current directory is
 * removed with xs_rm(); when that reports ENOTEMPTY the walk descends
 * into the entry, and once a directory has been emptied and removed the
 * walk returns to its parent by truncating the current path at its last
 * '/'.  The walk ends when the root path itself has been removed.
 *
 * If the caller passes a transaction whose id is 0, a private
 * transaction is started here; it is aborted on error and committed
 * otherwise.  A commit that fails with EAGAIN restarts the whole
 * operation; any other commit failure is returned to the caller.
 *
 * Returns 0 on success or an errno value.
 */
int
xs_rm_tree(struct xs_transaction xbt, const char *base, const char *node)
{
	struct xs_transaction local_xbt;
	struct sbuf *root_path_sbuf;
	struct sbuf *cur_path_sbuf;
	char *root_path;
	char *cur_path;
	const char **dir;
	int error;

retry:
	root_path_sbuf = xs_join(base, node);
	cur_path_sbuf = xs_join(base, node);
	root_path = sbuf_data(root_path_sbuf);
	cur_path = sbuf_data(cur_path_sbuf);
	dir = NULL;
	local_xbt.id = 0;
	if (xbt.id == 0) {
		error = xs_transaction_start(&local_xbt);
		if (error != 0)
			goto out;
		xbt = local_xbt;
	}

	while (1) {
		u_int count;
		u_int i;

		error = xs_directory(xbt, cur_path, "", &count, &dir);
		if (error)
			goto out;

		for (i = 0; i < count; i++) {
			error = xs_rm(xbt, cur_path, dir[i]);
			if (error == ENOTEMPTY) {
				struct sbuf *push_dir;

				/*
				 * Descend to clear out this sub directory.
				 * We'll return to cur_dir once push_dir
				 * is empty.  The new path is built before
				 * the old one is released because it is
				 * derived from cur_path.
				 */
				push_dir = xs_join(cur_path, dir[i]);
				sbuf_delete(cur_path_sbuf);
				cur_path_sbuf = push_dir;
				cur_path = sbuf_data(cur_path_sbuf);
				break;
			} else if (error != 0) {
				goto out;
			}
		}

		free(dir, M_XENSTORE);
		dir = NULL;

		/* i == count means no entry forced a descent. */
		if (i == count) {
			char *last_slash;

			/* Directory is empty.  It is now safe to remove. */
			error = xs_rm(xbt, cur_path, "");
			if (error != 0)
				goto out;

			if (!strcmp(cur_path, root_path))
				break;

			/* Return to processing the parent directory. */
			last_slash = strrchr(cur_path, '/');
			KASSERT(last_slash != NULL,
			    ("xs_rm_tree: mangled path %s", cur_path));
			*last_slash = '\0';
		}
	}

out:
	sbuf_delete(cur_path_sbuf);
	sbuf_delete(root_path_sbuf);
	if (dir != NULL)
		free(dir, M_XENSTORE);

	if (local_xbt.id != 0) {
		int terror;

		terror = xs_transaction_end(local_xbt, /*abort*/error != 0);
		/* Make a retry start a fresh private transaction. */
		xbt.id = 0;
		if (error == 0) {
			if (terror == EAGAIN)
				goto retry;
			/*
			 * The removals succeeded but the commit did not:
			 * report that instead of claiming success.
			 */
			error = terror;
		}
	}
	return (error);
}
/*
 * Issue an XS_TRANSACTION_START request outside of any transaction
 * (XST_NIL).  On success the textual reply is parsed with strtoul()
 * (base auto-detected) into t->id and the reply buffer is released.
 * On failure *t is not modified.
 */
int
xs_transaction_start(struct xs_transaction *t)
{
	char *reply;
	int err;

	err = xs_single(XST_NIL, XS_TRANSACTION_START, "", NULL,
	    (void **)&reply);
	if (err != 0)
		return (err);

	t->id = strtoul(reply, NULL, 0);
	free(reply, M_XENSTORE);

	return (0);
}
/*
 * Issue an XS_TRANSACTION_END request for transaction t.  The request
 * body is the one-character string "F" when abort is non-zero and "T"
 * otherwise.  Returns the status of the request.
 */
int
xs_transaction_end(struct xs_transaction t, int abort)
{
	char verdict[2];

	verdict[0] = abort ? 'F' : 'T';
	verdict[1] = '\0';

	return (xs_single(t, XS_TRANSACTION_END, verdict, NULL, NULL));
}
/*
 * Read dir/node and parse its value with a scanf-style format.
 *
 * Returns the error from xs_read() if the read fails, ERANGE if nothing
 * could be converted, and 0 otherwise.  On success, and if scancountp is
 * not NULL, *scancountp receives the number of items vsscanf() reported.
 */
int
xs_scanf(struct xs_transaction t, const char *dir, const char *node,
     int *scancountp, const char *fmt, ...)
{
	va_list ap;
	int error, ns;
	char *val;

	error = xs_read(t, dir, node, NULL, (void **) &val);
	if (error)
		return (error);

	va_start(ap, fmt);
	ns = vsscanf(val, fmt, ap);
	va_end(ap);
	free(val, M_XENSTORE);

	/*
	 * Distinctive errno.  vsscanf() reports an input failure (for
	 * example an empty value) with a negative result rather than 0;
	 * that must not be mistaken for success, since no output argument
	 * has been written in that case either.
	 */
	if (ns <= 0)
		return (ERANGE);

	if (scancountp)
		*scancountp = ns;
	return (0);
}
/*
 * Format a value from fmt/ap into an automatically sized sbuf and store
 * it at dir/node with xs_write().  Returns xs_write()'s status.
 */
int
xs_vprintf(struct xs_transaction t,
    const char *dir, const char *node, const char *fmt, va_list ap)
{
	struct sbuf *value;
	int status;

	/* Build the value text. */
	value = sbuf_new_auto();
	sbuf_vprintf(value, fmt, ap);
	sbuf_finish(value);

	/* Hand it to the store, then drop the temporary buffer. */
	status = xs_write(t, dir, node, sbuf_data(value));
	sbuf_delete(value);

	return (status);
}
/*
 * Variadic front end to xs_vprintf(): formats the arguments according to
 * fmt and stores the result at dir/node.  Returns xs_vprintf()'s status.
 */
int
xs_printf(struct xs_transaction t, const char *dir, const char *node,
     const char *fmt, ...)
{
	va_list args;
	int status;

	va_start(args, fmt);
	status = xs_vprintf(t, dir, node, fmt, args);
	va_end(args);

	return (status);
}
/*
 * Read several nodes below dir in one call.
 *
 * The variable arguments are (name, fmt, result) triples terminated by a
 * NULL name.  For each triple the node dir/name is read; if fmt is not
 * NULL the value is parsed with sscanf(fmt) into result and the raw
 * buffer is released, otherwise the raw buffer itself is stored through
 * result (as a char *) and is not released here.
 *
 * Processing stops at the first failure.  Returns 0 on success, the
 * xs_read() error if a read fails, or EINVAL if a value could not be
 * parsed.
 */
int
xs_gather(struct xs_transaction t, const char *dir, ...)
{
	va_list ap;
	const char *name;
	int error;

	va_start(ap, dir);
	error = 0;
	while (error == 0 && (name = va_arg(ap, char *)) != NULL) {
		const char *fmt = va_arg(ap, char *);
		void *result = va_arg(ap, void *);
		char *p;

		error = xs_read(t, dir, name, NULL, (void **) &p);
		if (error)
			break;

		if (fmt) {
			/*
			 * sscanf() signals an input failure (e.g. an
			 * empty value) with a negative result, not 0;
			 * both mean nothing was stored in result.
			 */
			if (sscanf(p, fmt, result) <= 0)
				error = EINVAL;
			free(p, M_XENSTORE);
		} else
			*(char **)result = p;
	}
	va_end(ap);

	return (error);
}
/*
 * Add watch to the list of registered watches and issue the watch
 * request for watch->node.  The token sent with the request is the
 * address of the watch structure printed in hexadecimal.
 *
 * An EEXIST reply is treated as success.  On any other failure the
 * watch is taken off the list again and the error is returned.
 * The suspend lock is held shared for the whole operation.
 */
int
xs_register_watch(struct xs_watch *watch)
{
	/* Pointer in ascii is the token. */
	char token[sizeof(watch) * 2 + 1];
	int status;

	sprintf(token, "%lX", (long)watch);

	sx_slock(&xs.suspend_mutex);

	/* Publish the watch on the list before issuing the request. */
	mtx_lock(&xs.registered_watches_lock);
	KASSERT(find_watch(token) == NULL, ("watch already registered"));
	LIST_INSERT_HEAD(&xs.registered_watches, watch, list);
	mtx_unlock(&xs.registered_watches_lock);

	status = xs_watch(watch->node, token);
	if (status == EEXIST) {
		/* Ignore errors due to multiple registration. */
		status = 0;
	} else if (status != 0) {
		/* The request failed: undo the list insertion. */
		mtx_lock(&xs.registered_watches_lock);
		LIST_REMOVE(watch, list);
		mtx_unlock(&xs.registered_watches_lock);
	}

	sx_sunlock(&xs.suspend_mutex);

	return (status);
}
/*
 * Undo xs_register_watch().
 *
 * If the watch is not currently registered nothing happens.  Otherwise
 * it is removed from the list of registered watches, an unwatch request
 * is issued (a failure is only logged), and every queued watch event
 * that refers to this watch is discarded.  Finally, unless the caller
 * is the xenwatch process itself, the xenwatch lock is acquired and
 * released once.
 */
void
xs_unregister_watch(struct xs_watch *watch)
{
	struct xs_stored_msg *event, *next_event;
	/* Pointer in ascii is the token. */
	char token[sizeof(watch) * 2 + 1];
	int status;

	sprintf(token, "%lX", (long)watch);

	sx_slock(&xs.suspend_mutex);

	mtx_lock(&xs.registered_watches_lock);
	if (find_watch(token) == NULL) {
		/* Not registered: nothing to undo. */
		mtx_unlock(&xs.registered_watches_lock);
		sx_sunlock(&xs.suspend_mutex);
		return;
	}
	LIST_REMOVE(watch, list);
	mtx_unlock(&xs.registered_watches_lock);

	status = xs_unwatch(watch->node, token);
	if (status != 0) {
		log(LOG_WARNING, "XENSTORE Failed to release watch %s: %i\n",
		    watch->node, status);
	}

	sx_sunlock(&xs.suspend_mutex);

	/* Cancel pending watch events. */
	mtx_lock(&xs.watch_events_lock);
	TAILQ_FOREACH_SAFE(event, &xs.watch_events, list, next_event) {
		if (event->u.watch.handle == watch) {
			TAILQ_REMOVE(&xs.watch_events, event, list);
			free(event->u.watch.vec, M_XENSTORE);
			free(event, M_XENSTORE);
		}
	}
	mtx_unlock(&xs.watch_events_lock);

	/* Flush any currently-executing callback, unless we are it. :-) */
	if (curproc->p_pid != xs.xenwatch_pid) {
		sx_xlock(&xs.xenwatch_mutex);
		sx_xunlock(&xs.xenwatch_mutex);
	}
}