Implement zero-copy iSCSI target transmission/read.

Add an ICL_NOCOPY flag to icl_pdu_append_data(), specifying that the method
can just reference the data buffer instead of immediately copying it.

Extend the offload KPI with an optional PDU queue method that accepts a
completion callback, called when all the data referenced as described above
has been transferred and won't be accessed any more (the buffers can be freed).

Implement the above functionality in the software iSCSI driver using mbufs
with external storage and a reference counter.  Note that some NICs (ixl(4))
may keep the mbuf in the TX queue for a long time, so CTL has to be ready
for that.

Add an optional method to struct ctl_scsiio for buffer reference counting.
Implement it for the CTL block backend, allowing the free of the struct
ctl_be_block_io and the memory it references to be delayed as needed.  In the
first incarnation of the patch I tried to delay the whole I/O, as is done for
FibreChannel; that was cleaner, but due to the above callback delays I had to
rewrite it this way to avoid leaving the LUN referenced potentially for hours
or more.

Altogether, on sequential reads from ZFS ARC this saves about 30% of CPU
time and memory bandwidth by avoiding one of 3 memory copies (the other
two are from ZFS ARC to DMU cache and then from DMU cache to CTL buffers).
In tests with 2x Xeon Silver 4114 this allows reaching the full line rate of
a 100GigE NIC.  Tests with Gold CPUs and two 100GigE NICs are still TBD,
but expectations of saturating them are pretty high. ;)

Discussed with:	Chelsio
Sponsored by:	iXsystems, Inc.
This commit is contained in:
Alexander Motin 2020-06-08 20:53:57 +00:00
parent c78cd98b8a
commit 9a4510ac32
7 changed files with 201 additions and 44 deletions

View File

@ -201,6 +201,7 @@ struct ctl_be_block_io {
union ctl_io *io;
struct ctl_sg_entry sg_segs[CTLBLK_MAX_SEGS];
struct iovec xiovecs[CTLBLK_MAX_SEGS];
int refcnt;
int bio_cmd;
int two_sglists;
int num_segs;
@ -305,11 +306,12 @@ ctl_alloc_beio(struct ctl_be_block_softc *softc)
beio = uma_zalloc(softc->beio_zone, M_WAITOK | M_ZERO);
beio->softc = softc;
beio->refcnt = 1;
return (beio);
}
static void
ctl_free_beio(struct ctl_be_block_io *beio)
ctl_real_free_beio(struct ctl_be_block_io *beio)
{
struct ctl_be_block_softc *softc = beio->softc;
int i;
@ -327,6 +329,22 @@ ctl_free_beio(struct ctl_be_block_io *beio)
uma_zfree(softc->beio_zone, beio);
}
/*
 * Adjust the reference count of a block backend I/O descriptor.
 *
 * 'arg' is the struct ctl_be_block_io, passed as void * so that this
 * function can be installed as a ctl_ref method; 'diff' is the signed
 * amount to add to the counter.  When the adjustment brings the counter
 * to zero the descriptor is released with ctl_real_free_beio().
 */
static void
ctl_refcnt_beio(void *arg, int diff)
{
	struct ctl_be_block_io *beio;

	beio = arg;

	/*
	 * atomic_fetchadd_int() returns the value held before the addition,
	 * so adding 'diff' again gives the value the counter has just taken.
	 */
	if (atomic_fetchadd_int(&beio->refcnt, diff) + diff != 0)
		return;

	ctl_real_free_beio(beio);
}
/*
 * Drop one reference on a block backend I/O descriptor.  The descriptor is
 * only really freed (by ctl_refcnt_beio()) once its count reaches zero.
 */
static void
ctl_free_beio(struct ctl_be_block_io *beio)
{
ctl_refcnt_beio(beio, -1);
}
static void
ctl_complete_beio(struct ctl_be_block_io *beio)
{
@ -1613,6 +1631,8 @@ ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
io->scsiio.kern_data_len = beio->io_len;
io->scsiio.kern_sg_entries = beio->num_segs;
io->scsiio.kern_data_ref = ctl_refcnt_beio;
io->scsiio.kern_data_arg = beio;
io->io_hdr.flags |= CTL_FLAG_ALLOCATED;
/*

View File

@ -424,6 +424,17 @@ cfiscsi_pdu_queue(struct icl_pdu *response)
CFISCSI_SESSION_UNLOCK(cs);
}
/*
 * Queue a response PDU for transmission with a completion callback.
 *
 * With the session lock held, runs cfiscsi_pdu_prepare() on the PDU and
 * hands it to icl_pdu_queue_cb() together with 'cb'.  cfiscsi_datamove_in()
 * passes a NULL 'cb' when the I/O has no kern_data_ref method.
 */
static void
cfiscsi_pdu_queue_cb(struct icl_pdu *response, icl_pdu_cb cb)
{
struct cfiscsi_session *cs = PDU_SESSION(response);
CFISCSI_SESSION_LOCK(cs);
cfiscsi_pdu_prepare(response);
icl_pdu_queue_cb(response, cb);
CFISCSI_SESSION_UNLOCK(cs);
}
static void
cfiscsi_pdu_handle_nop_out(struct icl_pdu *request)
{
@ -2416,6 +2427,15 @@ cfiscsi_target_find_or_create(struct cfiscsi_softc *softc, const char *name,
return (newct);
}
static void
cfiscsi_pdu_done(struct icl_pdu *ip, int error)
{
if (error != 0)
; // XXX: Do something on error?
((ctl_ref)ip->ip_prv0)(ip->ip_prv1, -1);
}
static void
cfiscsi_datamove_in(union ctl_io *io)
{
@ -2426,6 +2446,7 @@ cfiscsi_datamove_in(union ctl_io *io)
struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;
size_t len, expected_len, sg_len, buffer_offset;
const char *sg_addr;
icl_pdu_cb cb;
int ctl_sg_count, error, i;
request = PRIV_REQUEST(io);
@ -2471,6 +2492,11 @@ cfiscsi_datamove_in(union ctl_io *io)
return;
}
if (io->scsiio.kern_data_ref != NULL)
cb = cfiscsi_pdu_done;
else
cb = NULL;
i = 0;
sg_addr = NULL;
sg_len = 0;
@ -2534,7 +2560,8 @@ cfiscsi_datamove_in(union ctl_io *io)
len, sg_len));
}
error = icl_pdu_append_data(response, sg_addr, len, M_NOWAIT);
error = icl_pdu_append_data(response, sg_addr, len,
M_NOWAIT | (cb ? ICL_NOCOPY : 0));
if (error != 0) {
CFISCSI_SESSION_WARN(cs, "failed to "
"allocate memory; dropping connection");
@ -2587,7 +2614,12 @@ cfiscsi_datamove_in(union ctl_io *io)
buffer_offset -= response->ip_data_len;
break;
}
cfiscsi_pdu_queue(response);
if (cb != NULL) {
response->ip_prv0 = io->scsiio.kern_data_ref;
response->ip_prv1 = io->scsiio.kern_data_arg;
io->scsiio.kern_data_ref(io->scsiio.kern_data_arg, 1);
}
cfiscsi_pdu_queue_cb(response, cb);
response = NULL;
bhsdi = NULL;
}
@ -2617,7 +2649,12 @@ cfiscsi_datamove_in(union ctl_io *io)
}
}
KASSERT(response->ip_data_len > 0, ("sending empty Data-In"));
cfiscsi_pdu_queue(response);
if (cb != NULL) {
response->ip_prv0 = io->scsiio.kern_data_ref;
response->ip_prv1 = io->scsiio.kern_data_arg;
io->scsiio.kern_data_ref(io->scsiio.kern_data_arg, 1);
}
cfiscsi_pdu_queue_cb(response, cb);
}
io->scsiio.be_move_done(io);

View File

@ -257,6 +257,8 @@ typedef enum {
union ctl_io;
/*
 * Reference-counting method for I/O data buffers.  Called with an opaque
 * argument and a signed adjustment 'diff'; the iSCSI frontend passes 1 to
 * take a reference and -1 to drop one.
 */
typedef void (*ctl_ref)(void *arg, int diff);
/*
* SCSI passthrough I/O structure for the CAM Target Layer. Note
* that some of these fields are here for completeness, but they aren't
@ -329,6 +331,8 @@ struct ctl_scsiio {
uint8_t cdb[CTL_MAX_CDBLEN]; /* CDB */
int (*be_move_done)(union ctl_io *io); /* called by fe */
int (*io_cont)(union ctl_io *io); /* to continue processing */
ctl_ref kern_data_ref; /* Method to reference/release data */
void *kern_data_arg; /* Opaque argument for kern_data_ref() */
};
typedef enum {

View File

@ -79,9 +79,8 @@ struct icl_pdu {
/*
* User (initiator or provider) private fields.
*/
uint32_t ip_prv0;
uint32_t ip_prv1;
uint32_t ip_prv2;
void *ip_prv0;
void *ip_prv1;
};
#define ICL_CONN_STATE_INVALID 0
@ -93,6 +92,8 @@ struct icl_pdu {
#define ICL_MAX_DATA_SEGMENT_LENGTH (128 * 1024)
/*
 * Flag for icl_pdu_append_data(), OR'ed into the same 'flags' word as the
 * allocation flags (e.g. M_NOWAIT): the implementation may reference the
 * caller's buffer instead of copying it.
 */
#define ICL_NOCOPY (1 << 30)
struct icl_conn {
KOBJ_FIELDS;
struct mtx *ic_lock;
@ -136,6 +137,8 @@ struct icl_drv_limits {
int spare[4];
};
/*
 * PDU completion callback, given the PDU and an error code (0 when no
 * error was recorded).
 */
typedef void (*icl_pdu_cb)(struct icl_pdu *, int error);
struct icl_conn *icl_new_conn(const char *offload, bool iser, const char *name,
struct mtx *lock);
int icl_limits(const char *offload, bool iser,

View File

@ -36,6 +36,16 @@
INTERFACE icl_conn;
CODE {
/*
 * Default pdu_queue_cb implementation, used when a driver does not
 * provide its own: queue the PDU through the plain pdu_queue method and
 * then invoke the callback, if any, immediately with error 0.
 *
 * NOTE(review): 'ip' is passed to cb() after it has been handed to
 * ICL_CONN_PDU_QUEUE(); confirm that no driver relying on this default
 * can free the PDU before cb() reads it.
 */
static void null_pdu_queue_cb(struct icl_conn *ic,
struct icl_pdu *ip, icl_pdu_cb cb)
{
ICL_CONN_PDU_QUEUE(ic, ip);
if (cb)
cb(ip, 0);
}
};
METHOD size_t pdu_data_segment_length {
struct icl_conn *_ic;
const struct icl_pdu *_ip;
@ -62,6 +72,12 @@ METHOD void pdu_queue {
struct icl_pdu *_ip;
};
METHOD void pdu_queue_cb {
struct icl_conn *_ic;
struct icl_pdu *_ip;
icl_pdu_cb cb;
} DEFAULT null_pdu_queue_cb;
METHOD void pdu_free {
struct icl_conn *_ic;
struct icl_pdu *_ip;

View File

@ -64,6 +64,15 @@ __FBSDID("$FreeBSD$");
#include <dev/iscsi/iscsi_proto.h>
#include <icl_conn_if.h>
/*
 * Software iSCSI driver's private wrapper around the generic PDU.
 */
struct icl_soft_pdu {
/* Generic PDU; kept first because the driver casts icl_pdu pointers to icl_soft_pdu. */
struct icl_pdu ip;
/* soft specific stuff goes here. */
/* References on the PDU: one taken when it is queued, plus those taken by mbufs referencing external data. */
u_int ref_cnt;
/* Completion callback supplied at queue time; may be NULL. */
icl_pdu_cb cb;
/* Last non-zero error recorded for the PDU, handed to 'cb'. */
int error;
};
static int coalesce = 1;
SYSCTL_INT(_kern_icl, OID_AUTO, coalesce, CTLFLAG_RWTUN,
&coalesce, 0, "Try to coalesce PDUs before sending");
@ -79,7 +88,7 @@ SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
&recvspace, 0, "Default receive socket buffer size");
static MALLOC_DEFINE(M_ICL_SOFT, "icl_soft", "iSCSI software backend");
static uma_zone_t icl_pdu_zone;
static uma_zone_t icl_soft_pdu_zone;
static volatile u_int icl_ncons;
@ -97,6 +106,7 @@ static icl_conn_pdu_data_segment_length_t
static icl_conn_pdu_append_data_t icl_soft_conn_pdu_append_data;
static icl_conn_pdu_get_data_t icl_soft_conn_pdu_get_data;
static icl_conn_pdu_queue_t icl_soft_conn_pdu_queue;
static icl_conn_pdu_queue_cb_t icl_soft_conn_pdu_queue_cb;
static icl_conn_handoff_t icl_soft_conn_handoff;
static icl_conn_free_t icl_soft_conn_free;
static icl_conn_close_t icl_soft_conn_close;
@ -116,6 +126,7 @@ static kobj_method_t icl_soft_methods[] = {
KOBJMETHOD(icl_conn_pdu_append_data, icl_soft_conn_pdu_append_data),
KOBJMETHOD(icl_conn_pdu_get_data, icl_soft_conn_pdu_get_data),
KOBJMETHOD(icl_conn_pdu_queue, icl_soft_conn_pdu_queue),
KOBJMETHOD(icl_conn_pdu_queue_cb, icl_soft_conn_pdu_queue_cb),
KOBJMETHOD(icl_conn_handoff, icl_soft_conn_handoff),
KOBJMETHOD(icl_conn_free, icl_soft_conn_free),
KOBJMETHOD(icl_conn_close, icl_soft_conn_close),
@ -209,35 +220,79 @@ icl_conn_receive_buf(struct icl_conn *ic, void *buf, size_t len)
static void
icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
KASSERT(isp->ref_cnt == 0, ("freeing active PDU"));
m_freem(ip->ip_bhs_mbuf);
m_freem(ip->ip_ahs_mbuf);
m_freem(ip->ip_data_mbuf);
uma_zfree(icl_pdu_zone, ip);
uma_zfree(icl_soft_pdu_zone, isp);
#ifdef DIAGNOSTIC
refcount_release(&ic->ic_outstanding_pdus);
#endif
}
/*
 * Final stage of a soft PDU's life: invoke the completion callback, if one
 * was registered, with the recorded error, then return the PDU to its UMA
 * zone.  Called from icl_soft_pdu_done() and icl_soft_mbuf_done().
 */
static void
icl_soft_pdu_call_cb(struct icl_pdu *ip)
{
struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
if (isp->cb != NULL)
isp->cb(ip, isp->error);
#ifdef DIAGNOSTIC
/* Balances the refcount_acquire() done in icl_soft_conn_new_pdu(). */
refcount_release(&ip->ip_conn->ic_outstanding_pdus);
#endif
/* The PDU memory is gone after this; 'ip' must not be used any more. */
uma_zfree(icl_soft_pdu_zone, isp);
}
/*
 * Finish with a PDU on the driver side.
 *
 * Records 'error' if it is non-zero, frees the BHS, AHS and data mbuf
 * chains still attached to the PDU, and drops one reference.  If that was
 * the last reference, the completion callback is run and the PDU is
 * released via icl_soft_pdu_call_cb(); otherwise that is left to whoever
 * drops the remaining references.
 */
static void
icl_soft_pdu_done(struct icl_pdu *ip, int error)
{
	struct icl_soft_pdu *isp;

	isp = (struct icl_soft_pdu *)ip;

	/* A zero 'error' never overwrites a previously recorded failure. */
	if (error != 0)
		isp->error = error;

	m_freem(ip->ip_bhs_mbuf);
	ip->ip_bhs_mbuf = NULL;

	m_freem(ip->ip_ahs_mbuf);
	ip->ip_ahs_mbuf = NULL;

	m_freem(ip->ip_data_mbuf);
	ip->ip_data_mbuf = NULL;

	/* atomic_fetchadd_int() returns the count before the decrement. */
	if (atomic_fetchadd_int(&isp->ref_cnt, -1) != 1)
		return;

	icl_soft_pdu_call_cb(ip);
}
/*
 * External-storage free routine installed by icl_soft_conn_pdu_append_data()
 * for mbufs that reference caller data.  The owning soft PDU is stored as
 * the first external argument; it is completed through
 * icl_soft_pdu_call_cb().
 */
static void
icl_soft_mbuf_done(struct mbuf *mb)
{
	struct icl_soft_pdu *isp;

	isp = (struct icl_soft_pdu *)mb->m_ext.ext_arg1;

	icl_soft_pdu_call_cb(&isp->ip);
}
/*
* Allocate icl_pdu with empty BHS to fill up by the caller.
*/
struct icl_pdu *
icl_soft_conn_new_pdu(struct icl_conn *ic, int flags)
{
struct icl_soft_pdu *isp;
struct icl_pdu *ip;
#ifdef DIAGNOSTIC
refcount_acquire(&ic->ic_outstanding_pdus);
#endif
ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
if (ip == NULL) {
ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
isp = uma_zalloc(icl_soft_pdu_zone, flags | M_ZERO);
if (isp == NULL) {
ICL_WARN("failed to allocate soft PDU");
#ifdef DIAGNOSTIC
refcount_release(&ic->ic_outstanding_pdus);
#endif
return (NULL);
}
ip = &isp->ip;
ip->ip_conn = ic;
CTASSERT(sizeof(struct iscsi_bhs) <= MHLEN);
@ -926,7 +981,7 @@ icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
if (error != 0) {
ICL_DEBUG("failed to finalize PDU; "
"dropping connection");
icl_soft_conn_pdu_free(ic, request);
icl_soft_pdu_done(request, EIO);
icl_conn_fail(ic);
return;
}
@ -944,8 +999,8 @@ icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
if (error != 0) {
ICL_DEBUG("failed to finalize PDU; "
"dropping connection");
icl_soft_conn_pdu_free(ic, request);
icl_soft_conn_pdu_free(ic, request2);
icl_soft_pdu_done(request, EIO);
icl_soft_pdu_done(request2, EIO);
icl_conn_fail(ic);
return;
}
@ -954,7 +1009,7 @@ icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
request->ip_bhs_mbuf->m_pkthdr.len += size2;
size += size2;
STAILQ_REMOVE_AFTER(queue, request, ip_next);
icl_soft_conn_pdu_free(ic, request2);
icl_soft_pdu_done(request2, 0);
coalesced++;
}
#if 0
@ -971,11 +1026,11 @@ icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
if (error != 0) {
ICL_DEBUG("failed to send PDU, error %d; "
"dropping connection", error);
icl_soft_conn_pdu_free(ic, request);
icl_soft_pdu_done(request, error);
icl_conn_fail(ic);
return;
}
icl_soft_conn_pdu_free(ic, request);
icl_soft_pdu_done(request, 0);
}
}
@ -1072,24 +1127,38 @@ static int
icl_soft_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
const void *addr, size_t len, int flags)
{
struct icl_soft_pdu *isp = (struct icl_soft_pdu *)request;
struct mbuf *mb, *newmb;
size_t copylen, off = 0;
KASSERT(len > 0, ("len == 0"));
newmb = m_getm2(NULL, len, flags, MT_DATA, 0);
if (newmb == NULL) {
ICL_WARN("failed to allocate mbuf for %zd bytes", len);
return (ENOMEM);
}
if (flags & ICL_NOCOPY) {
newmb = m_get(flags & ~ICL_NOCOPY, MT_DATA);
if (newmb == NULL) {
ICL_WARN("failed to allocate mbuf");
return (ENOMEM);
}
for (mb = newmb; mb != NULL; mb = mb->m_next) {
copylen = min(M_TRAILINGSPACE(mb), len - off);
memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
mb->m_len = copylen;
off += copylen;
newmb->m_flags |= M_RDONLY;
m_extaddref(newmb, __DECONST(char *, addr), len, &isp->ref_cnt,
icl_soft_mbuf_done, isp, NULL);
newmb->m_len = len;
} else {
newmb = m_getm2(NULL, len, flags, MT_DATA, 0);
if (newmb == NULL) {
ICL_WARN("failed to allocate mbuf for %zd bytes", len);
return (ENOMEM);
}
for (mb = newmb; mb != NULL; mb = mb->m_next) {
copylen = min(M_TRAILINGSPACE(mb), len - off);
memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
mb->m_len = copylen;
off += copylen;
}
KASSERT(off == len, ("%s: off != len", __func__));
}
KASSERT(off == len, ("%s: off != len", __func__));
if (request->ip_data_mbuf == NULL) {
request->ip_data_mbuf = newmb;
@ -1111,17 +1180,25 @@ icl_soft_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
}
static void
icl_pdu_queue(struct icl_pdu *ip)
icl_soft_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{
struct icl_conn *ic;
ic = ip->ip_conn;
icl_soft_conn_pdu_queue_cb(ic, ip, NULL);
}
static void
icl_soft_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip,
icl_pdu_cb cb)
{
struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
ICL_CONN_LOCK_ASSERT(ic);
isp->ref_cnt++;
isp->cb = cb;
if (ic->ic_disconnecting || ic->ic_socket == NULL) {
ICL_DEBUG("icl_pdu_queue on closed connection");
icl_soft_conn_pdu_free(ic, ip);
icl_soft_pdu_done(ip, ENOTCONN);
return;
}
@ -1139,13 +1216,6 @@ icl_pdu_queue(struct icl_pdu *ip)
cv_signal(&ic->ic_send_cv);
}
void
icl_soft_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{
icl_pdu_queue(ip);
}
static struct icl_conn *
icl_soft_new_conn(const char *name, struct mtx *lock)
{
@ -1414,7 +1484,7 @@ icl_soft_conn_close(struct icl_conn *ic)
while (!STAILQ_EMPTY(&ic->ic_to_send)) {
pdu = STAILQ_FIRST(&ic->ic_to_send);
STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
icl_soft_conn_pdu_free(ic, pdu);
icl_soft_pdu_done(pdu, ENOTCONN);
}
KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
@ -1499,8 +1569,8 @@ icl_soft_load(void)
{
int error;
icl_pdu_zone = uma_zcreate("icl_pdu",
sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
icl_soft_pdu_zone = uma_zcreate("icl_soft_pdu",
sizeof(struct icl_soft_pdu), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
refcount_init(&icl_ncons, 0);
@ -1537,7 +1607,7 @@ icl_soft_unload(void)
icl_unregister("proxytest", true);
#endif
uma_zdestroy(icl_pdu_zone);
uma_zdestroy(icl_soft_pdu_zone);
return (0);
}

View File

@ -79,6 +79,13 @@ icl_pdu_queue(struct icl_pdu *ip)
ICL_CONN_PDU_QUEUE(ip->ip_conn, ip);
}
/*
 * Queue a PDU on its own connection (ip->ip_conn) through the pdu_queue_cb
 * connection method, passing along the completion callback 'cb'.
 */
static inline void
icl_pdu_queue_cb(struct icl_pdu *ip, icl_pdu_cb cb)
{
ICL_CONN_PDU_QUEUE_CB(ip->ip_conn, ip, cb);
}
static inline void
icl_pdu_free(struct icl_pdu *ip)
{