nvmf/tcp: zero-copy support

This patch adds support for using zero-copy operations to execute IO
requests in the TCP transport.  They are only used if the underlying
bdev supports them, and only requests without in-capsule data can be
executed through this mechanism.
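
For reference, the eligibility check boils down to the two conditions
above.  A minimal sketch of such a predicate (the struct and names here
are simplified stand-ins, not SPDK's actual API):

#include <stdbool.h>
#include <stddef.h>

/* Simplified request model; the real SPDK structures carry far more state. */
struct io_request {
	bool   bdev_supports_zcopy;  /* capability reported by the underlying bdev */
	size_t in_capsule_data_len;  /* bytes of data embedded in the command capsule */
};

/* A request may take the zero-copy path only if the bdev supports it
 * and the command carries no in-capsule data. */
static bool
request_can_use_zcopy(const struct io_request *req)
{
	return req->bdev_supports_zcopy && req->in_capsule_data_len == 0;
}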

Added several new states to accommodate the differences in how
zero-copy requests are handled.  These flows also vary depending on the
type of request (read or write), which stems from zero-copy semantics:
to perform a write, we need to wait for zcopy_end completion, while for
reads, zcopy_end can only be submitted once all of the requested data
has been sent to the host.
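
To make the asymmetry concrete, here is a condensed walk through the
two flows.  The state names match the enum added below; the driver
itself is illustrative only, not transport code:

#include <stdio.h>

/* Illustrative walk through the two zero-copy flows.  State names mirror
 * TCP_REQUEST_STATE_*; the control flow is a sketch of the real state machine. */
enum xfer_dir { WRITE_HOST_TO_CONTROLLER, READ_CONTROLLER_TO_HOST };

static void
zcopy_flow(enum xfer_dir dir)
{
	/* Both flows begin by asking the bdev for zero-copy buffers. */
	puts("AWAITING_ZCOPY_START -> ZCOPY_START_COMPLETED");

	if (dir == WRITE_HOST_TO_CONTROLLER) {
		/* Write: pull the data from the host into the zcopy buffers,
		 * then commit them; the response is sent only after
		 * zcopy_end(commit=true) completes. */
		puts("send R2T -> TRANSFERRING_HOST_TO_CONTROLLER -> READY_TO_EXECUTE");
		puts("AWAITING_ZCOPY_COMMIT (zcopy_end, commit=true) -> EXECUTED");
		puts("send response -> COMPLETED");
	} else {
		/* Read: the zcopy buffers already hold the data, so there is no
		 * separate execute step; zcopy_end(commit=false) is submitted
		 * only after all data has been sent to the host. */
		puts("EXECUTED -> TRANSFERRING_CONTROLLER_TO_HOST");
		puts("AWAITING_ZCOPY_RELEASE (zcopy_end, commit=false) -> COMPLETED");
	}
}

int
main(void)
{
	puts("write:");
	zcopy_flow(WRITE_HOST_TO_CONTROLLER);
	puts("read:");
	zcopy_flow(READ_CONTROLLER_TO_HOST);
	return 0;
}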

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ie02b494c24bc1acc98557cb4b02e867abf9064e4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/10796
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>

@@ -71,6 +71,12 @@ enum spdk_nvmf_tcp_req_state {
/* The request is queued until a data buffer is available. */
TCP_REQUEST_STATE_NEED_BUFFER,
/* The request is waiting for zcopy_start to finish */
TCP_REQUEST_STATE_AWAITING_ZCOPY_START,
/* The request has received a zero-copy buffer */
TCP_REQUEST_STATE_ZCOPY_START_COMPLETED,
/* The request is currently transferring data from the host to the controller. */
TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
@@ -83,6 +89,9 @@ enum spdk_nvmf_tcp_req_state {
/* The request is currently executing at the block device */
TCP_REQUEST_STATE_EXECUTING,
/* The request is waiting for zcopy buffers to be committed */
TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT,
/* The request finished executing at the block device */
TCP_REQUEST_STATE_EXECUTED,
@@ -92,6 +101,9 @@ enum spdk_nvmf_tcp_req_state {
/* The request is currently transferring final pdus from the controller to the host. */
TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
/* The request is waiting for zcopy buffers to be released (without committing) */
TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE,
/* The request completed and can be marked free. */
TCP_REQUEST_STATE_COMPLETED,
@@ -1426,8 +1438,14 @@ nvmf_tcp_capsule_cmd_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport,
tcp_req = nvmf_tcp_req_get(tqpair);
if (!tcp_req) {
/* Directly return and make the allocation retry again */
if (tqpair->state_cntr[TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST] > 0) {
/* Directly return and make the allocation retry again. This can happen if we're
* using asynchronous writes to send the response to the host or when releasing
* zero-copy buffers after a response has been sent. In both cases, the host might
* receive the response before we've finished processing the request and is free to
* send another one.
*/
if (tqpair->state_cntr[TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST] > 0 ||
tqpair->state_cntr[TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE] > 0) {
return;
}
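
A transient shortage of request slots is thus tolerated only while some
request is still winding down, i.e. sending its response or releasing
zero-copy buffers; otherwise the host has exceeded the negotiated queue
depth.  A compact restatement of the check (hypothetical helper, state
set reduced to the two relevant entries):

#include <stdbool.h>

enum {
	TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
	TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE,
	TCP_REQUEST_NUM_STATES
};

/* Retry the allocation later only if a request is still finishing up;
 * in that window the host may already have seen its response and be
 * within its rights to submit a new command. */
bool
allocation_retry_ok(const int state_cntr[TCP_REQUEST_NUM_STATES])
{
	return state_cntr[TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST] > 0 ||
	       state_cntr[TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE] > 0;
}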
@@ -1457,6 +1475,9 @@ nvmf_tcp_capsule_cmd_payload_handle(struct spdk_nvmf_tcp_transport *ttransport,
tcp_req = pdu->req;
assert(tcp_req != NULL);
/* Zero-copy requests don't support ICD */
assert(!spdk_nvmf_request_using_zcopy(&tcp_req->req));
if (capsule_cmd->common.pdo > SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET) {
SPDK_ERRLOG("Expected ICReq capsule_cmd pdu offset <= %d, got %c\n",
SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET, capsule_cmd->common.pdo);
@@ -2224,6 +2245,12 @@ nvmf_tcp_req_parse_sgl(struct spdk_nvmf_tcp_req *tcp_req,
req->dif.elba_length = length;
}
if (nvmf_ctrlr_use_zcopy(req)) {
SPDK_DEBUGLOG(nvmf_tcp, "Using zero-copy to execute request %p\n", tcp_req);
req->data_from_pool = false;
return 0;
}
if (spdk_nvmf_request_get_buffers(req, group, transport, length)) {
/* No available buffers. Queue this request up. */
SPDK_DEBUGLOG(nvmf_tcp, "No available large data buffers. Queueing request %p\n",
@@ -2608,6 +2635,19 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
break;
}
/* Get a zcopy buffer if the request can be serviced through zcopy */
if (spdk_nvmf_request_using_zcopy(&tcp_req->req)) {
if (spdk_unlikely(tcp_req->req.dif_enabled)) {
assert(tcp_req->req.dif.elba_length >= tcp_req->req.length);
tcp_req->req.length = tcp_req->req.dif.elba_length;
}
STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link);
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_ZCOPY_START);
spdk_nvmf_request_zcopy_start(&tcp_req->req);
break;
}
if (!tcp_req->req.data) {
SPDK_DEBUGLOG(nvmf_tcp, "No buffer allocated for tcp_req(%p) on tqpair(%p\n)",
tcp_req, tqpair);
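
Note that spdk_nvmf_request_zcopy_start() is asynchronous: the request
parks in AWAITING_ZCOPY_START and is only advanced when the completion
path (nvmf_tcp_req_complete(), updated further down) moves it to
ZCOPY_START_COMPLETED.  A minimal model of that park-and-kick pattern,
using hypothetical names:

#include <stdio.h>

enum state { AWAITING_ZCOPY_START, ZCOPY_START_COMPLETED };

struct request {
	enum state state;
};

typedef void (*completion_cb)(struct request *req);

/* Stand-in for the bdev layer; in real code the callback fires later from
 * the bdev's I/O completion path, not synchronously as it does here. */
static void
zcopy_start(struct request *req, completion_cb cb)
{
	cb(req);
}

/* Mirrors the AWAITING_ZCOPY_START case added to nvmf_tcp_req_complete(). */
static void
on_zcopy_start_done(struct request *req)
{
	req->state = ZCOPY_START_COMPLETED;
	puts("request advanced to ZCOPY_START_COMPLETED");
}

int
main(void)
{
	struct request req = { .state = AWAITING_ZCOPY_START };
	zcopy_start(&req, on_zcopy_start_done);
	return 0;
}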
@@ -2640,6 +2680,24 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE);
break;
case TCP_REQUEST_STATE_AWAITING_ZCOPY_START:
/* Some external code must kick a request into TCP_REQUEST_STATE_ZCOPY_START_COMPLETED
* to escape this state. */
break;
case TCP_REQUEST_STATE_ZCOPY_START_COMPLETED:
if (spdk_unlikely(spdk_nvme_cpl_is_error(&tcp_req->req.rsp->nvme_cpl))) {
SPDK_DEBUGLOG(nvmf_tcp, "Zero-copy start failed for tcp_req(%p) on tqpair=%p\n",
tcp_req, tqpair);
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE);
break;
}
if (tcp_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
SPDK_DEBUGLOG(nvmf_tcp, "Sending R2T for tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair);
nvmf_tcp_send_r2t_pdu(tqpair, tcp_req);
} else {
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED);
}
break;
case TCP_REQUEST_STATE_AWAITING_R2T_ACK:
spdk_trace_record(TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK, 0, 0, (uintptr_t)tcp_req,
tqpair);
@@ -2661,14 +2719,26 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
tcp_req->req.length = tcp_req->req.dif.elba_length;
}
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTING);
spdk_nvmf_request_exec(&tcp_req->req);
if (!spdk_nvmf_request_using_zcopy(&tcp_req->req)) {
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTING);
spdk_nvmf_request_exec(&tcp_req->req);
} else {
/* For zero-copy, only requests with data coming from the host to the
* controller can end up here. */
assert(tcp_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT);
spdk_nvmf_request_zcopy_end(&tcp_req->req, true);
}
break;
case TCP_REQUEST_STATE_EXECUTING:
spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTING, 0, 0, (uintptr_t)tcp_req, tqpair);
/* Some external code must kick a request into TCP_REQUEST_STATE_EXECUTED
* to escape this state. */
break;
case TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT:
/* Some external code must kick a request into TCP_REQUEST_STATE_EXECUTED
* to escape this state. */
break;
case TCP_REQUEST_STATE_EXECUTED:
spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTED, 0, 0, (uintptr_t)tcp_req, tqpair);
@@ -2690,6 +2760,10 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
/* Some external code must kick a request into TCP_REQUEST_STATE_COMPLETED
* to escape this state. */
break;
case TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE:
/* Some external code must kick a request into TCP_REQUEST_STATE_COMPLETED
* to escape this state. */
break;
case TCP_REQUEST_STATE_COMPLETED:
spdk_trace_record(TRACE_TCP_REQUEST_STATE_COMPLETED, 0, 0, (uintptr_t)tcp_req, tqpair);
if (tcp_req->req.data_from_pool) {
@@ -2701,6 +2775,15 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
assert(tgroup->control_msg_list);
SPDK_DEBUGLOG(nvmf_tcp, "Put buf to control msg list\n");
nvmf_tcp_control_msg_put(tgroup->control_msg_list, tcp_req->req.data);
} else if (tcp_req->req.zcopy_bdev_io != NULL) {
/* If the request has an unreleased zcopy bdev_io, it's either a
* read or a failed write */
assert(spdk_nvmf_request_using_zcopy(&tcp_req->req));
assert(tcp_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST ||
spdk_nvme_cpl_is_error(&tcp_req->req.rsp->nvme_cpl));
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE);
spdk_nvmf_request_zcopy_end(&tcp_req->req, false);
break;
}
tcp_req->req.length = 0;
tcp_req->req.iovcnt = 0;
@@ -2821,7 +2904,22 @@ nvmf_tcp_req_complete(struct spdk_nvmf_request *req)
ttransport = SPDK_CONTAINEROF(req->qpair->transport, struct spdk_nvmf_tcp_transport, transport);
tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req);
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED);
switch (tcp_req->state) {
case TCP_REQUEST_STATE_EXECUTING:
case TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT:
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED);
break;
case TCP_REQUEST_STATE_AWAITING_ZCOPY_START:
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_ZCOPY_START_COMPLETED);
break;
case TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE:
nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED);
break;
default:
assert(0 && "Unexpected request state");
break;
}
nvmf_tcp_req_process(ttransport, tcp_req);
return 0;
@@ -2960,6 +3058,8 @@ _nvmf_tcp_qpair_abort_request(void *ctx)
switch (tcp_req_to_abort->state) {
case TCP_REQUEST_STATE_EXECUTING:
case TCP_REQUEST_STATE_AWAITING_ZCOPY_START:
case TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT:
rc = nvmf_ctrlr_abort_request(req);
if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) {
return SPDK_POLLER_BUSY;
@@ -2975,6 +3075,7 @@ _nvmf_tcp_qpair_abort_request(void *ctx)
break;
case TCP_REQUEST_STATE_AWAITING_R2T_ACK:
case TCP_REQUEST_STATE_ZCOPY_START_COMPLETED:
nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort);
break;