cxgbei: Hardware accelerated iSCSI target and initiator for TOE capable
cards supported by cxgbe(4).

On the host side this driver interfaces with the storage stack via the
ICL (iSCSI Common Layer) in the kernel.  On the wire the traffic is
standard iSCSI (SCSI over TCP as per RFC 3720/7143, etc.) that
interoperates with all other standards-compliant implementations.  The
driver is layered on top of the TOE driver (t4_tom) and promotes
connections being handled by t4_tom to iSCSI ULP (Upper Layer Protocol)
mode.  Hardware assistance in this mode includes:

- Full TCP processing.
- iSCSI PDU identification and recovery within the TCP stream.
- Header and/or data digest insertion (tx) and verification (rx).
- Zero copy (both tx and rx).
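
The ICL hookup is a plain backend registration.  A minimal sketch, using
the registration call as it appears in icl_cxgbei.c in this commit:

    /* From icl_cxgbei_load(): register "cxgbei" as an ICL backend. */
    error = icl_register("cxgbei", 100, icl_cxgbei_limits,
        icl_cxgbei_new_conn);
    KASSERT(error == 0, ("failed to register"));

The iSCSI stack then calls icl_cxgbei_new_conn() to create connections
that this driver will offload.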

A man page will follow in a separate commit in a couple of weeks.

Relnotes:	Yes
Sponsored by:	Chelsio Communications
Author:	Navdeep Parhar
Date:	2015-12-26 06:05:21 +00:00
Commit:	e3148e46b2 (parent 66e979f15c)
7 changed files with 2853 additions and 0 deletions

sys/dev/cxgbe/cxgbei/cxgbei.c (diff not shown: file too large)

sys/dev/cxgbe/cxgbei/cxgbei.h
@@ -0,0 +1,167 @@
/*-
* Copyright (c) 2012, 2015 Chelsio Communications, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#ifndef __CXGBEI_OFLD_H__
#define __CXGBEI_OFLD_H__
#include <dev/iscsi/icl.h>
enum {
CWT_SLEEPING = 1,
CWT_RUNNING = 2,
CWT_STOP = 3,
CWT_STOPPED = 4,
};
struct cxgbei_worker_thread_softc {
struct mtx cwt_lock;
struct cv cwt_cv;
volatile int cwt_state;
TAILQ_HEAD(, icl_cxgbei_conn) rx_head;
} __aligned(CACHE_LINE_SIZE);
#define CXGBEI_CONN_SIGNATURE 0x56788765
enum {
RXF_ACTIVE = 1 << 0, /* In the worker thread's queue */
};
struct icl_cxgbei_conn {
struct icl_conn ic;
/* cxgbei specific stuff goes here. */
uint32_t icc_signature;
int ulp_submode;
struct adapter *sc;
struct toepcb *toep;
/* Receive related. */
u_int rx_flags; /* protected by so_rcv lock */
u_int cwt;
STAILQ_HEAD(, icl_pdu) rcvd_pdus; /* protected by so_rcv lock */
TAILQ_ENTRY(icl_cxgbei_conn) rx_link; /* protected by cwt lock */
};
static inline struct icl_cxgbei_conn *
ic_to_icc(struct icl_conn *ic)
{
return (__containerof(ic, struct icl_cxgbei_conn, ic));
}
#define CXGBEI_PDU_SIGNATURE 0x12344321
struct icl_cxgbei_pdu {
struct icl_pdu ip;
/* cxgbei specific stuff goes here. */
uint32_t icp_signature;
uint32_t pdu_seq; /* For debug only */
u_int pdu_flags;
};
static inline struct icl_cxgbei_pdu *
ip_to_icp(struct icl_pdu *ip)
{
return (__containerof(ip, struct icl_cxgbei_pdu, ip));
}
struct cxgbei_sgl {
int sg_flag;
void *sg_addr;
void *sg_dma_addr;
size_t sg_offset;
size_t sg_length;
};
#define cxgbei_scsi_for_each_sg(_sgl, _sgel, _n, _i) \
for (_i = 0, _sgel = (struct cxgbei_sgl *)(_sgl); _i < _n; _i++, \
_sgel++)
#define sg_dma_addr(_sgel) ((_sgel)->sg_dma_addr)
#define sg_virt(_sgel) ((_sgel)->sg_addr)
#define sg_len(_sgel) ((_sgel)->sg_length)
#define sg_off(_sgel) ((_sgel)->sg_offset)
#define sg_next(_sgel) ((_sgel) + 1)
#define SBUF_ULP_FLAG_HDR_RCVD 0x1
#define SBUF_ULP_FLAG_DATA_RCVD 0x2
#define SBUF_ULP_FLAG_STATUS_RCVD 0x4
#define SBUF_ULP_FLAG_HCRC_ERROR 0x10
#define SBUF_ULP_FLAG_DCRC_ERROR 0x20
#define SBUF_ULP_FLAG_PAD_ERROR 0x40
#define SBUF_ULP_FLAG_DATA_DDPED 0x80
/* private data for each scsi task */
struct cxgbei_task_data {
struct cxgbei_sgl sgl[256];
u_int nsge;
u_int sc_ddp_tag;
};
struct cxgbei_ulp2_tag_format {
u_char sw_bits;
u_char rsvd_bits;
u_char rsvd_shift;
u_char filler[1];
uint32_t rsvd_mask;
};
struct cxgbei_data {
u_int max_txsz;
u_int max_rxsz;
u_int llimit;
u_int ulimit;
u_int nppods;
u_int idx_last;
u_char idx_bits;
uint32_t idx_mask;
uint32_t rsvd_tag_mask;
struct mtx map_lock;
bus_dma_tag_t ulp_ddp_tag;
unsigned char *colors;
struct cxgbei_ulp2_gather_list **gl_map;
struct cxgbei_ulp2_tag_format tag_format;
};
void cxgbei_conn_task_reserve_itt(void *, void **, void *, unsigned int *);
void cxgbei_conn_transfer_reserve_ttt(void *, void **, void *, unsigned int *);
void cxgbei_cleanup_task(void *, void *);
u_int cxgbei_select_worker_thread(struct icl_cxgbei_conn *);
struct cxgbei_ulp2_pagepod_hdr;
int t4_ddp_set_map(struct cxgbei_data *, void *,
struct cxgbei_ulp2_pagepod_hdr *, u_int, u_int,
struct cxgbei_ulp2_gather_list *, int);
void t4_ddp_clear_map(struct cxgbei_data *, struct cxgbei_ulp2_gather_list *,
u_int, u_int, u_int, struct icl_cxgbei_conn *);
#endif

sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c
@@ -0,0 +1,417 @@
/*-
* Copyright (c) 2012 Chelsio Communications, Inc.
* All rights reserved.
*
* Chelsio T5xx iSCSI driver
* cxgbei_ulp2_ddp.c: Chelsio iSCSI DDP Manager.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#ifdef TCP_OFFLOAD
#include <sys/types.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"
#include "cxgbei_ulp2_ddp.h"
/*
* Map a single buffer address.
*/
static void
ulp2_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
bus_addr_t *ba = arg;
if (error)
return;
KASSERT(nseg == 1, ("%s: %d segments returned!", __func__, nseg));
*ba = segs->ds_addr;
}
/*
* iSCSI Direct Data Placement
*
* T4/5 ulp2 h/w can directly place the iSCSI Data-In or Data-Out PDU's
* payload into pre-posted final destination host-memory buffers based on the
* Initiator Task Tag (ITT) in Data-In or Target Task Tag (TTT) in Data-Out
* PDUs.
*
* The host memory address is programmed into the h/w in the form of pagepod
* entries.  The location of a pagepod entry is encoded into the ddp tag,
* which is used as, or as the base for, the ITT/TTT.
*/
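/*
 * Illustrative example (values chosen for the example, not from any
 * particular adapter): cxgbei_ulp2_ddp_tag_base() in cxgbei_ulp2_ddp.h
 * composes the tag as
 *
 *   tag = sw_tag << (rsvd_bits + rsvd_shift) |
 *         idx << IPPOD_IDX_SHIFT | color
 *
 * so with idx 0x12, color 0x5, and sw_tag 0 the ddp tag is
 * 0x12 << 6 | 0x5 = 0x485.
 */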
static inline int
ddp_find_unused_entries(struct cxgbei_data *ci, u_int start, u_int max,
u_int count, u_int *idx, struct cxgbei_ulp2_gather_list *gl)
{
unsigned int i, j, k;
/* not enough entries */
if (max - start < count)
return (EBUSY);
max -= count;
mtx_lock(&ci->map_lock);
for (i = start; i < max;) {
for (j = 0, k = i; j < count; j++, k++) {
if (ci->gl_map[k])
break;
}
if (j == count) {
for (j = 0, k = i; j < count; j++, k++)
ci->gl_map[k] = gl;
mtx_unlock(&ci->map_lock);
*idx = i;
return (0);
}
i += j + 1;
}
mtx_unlock(&ci->map_lock);
return (EBUSY);
}
static inline void
ddp_unmark_entries(struct cxgbei_data *ci, u_int start, u_int count)
{
mtx_lock(&ci->map_lock);
memset(&ci->gl_map[start], 0,
count * sizeof(struct cxgbei_ulp2_gather_list *));
mtx_unlock(&ci->map_lock);
}
static inline void
ddp_gl_unmap(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl)
{
int i;
if (!gl->pages[0])
return;
for (i = 0; i < gl->nelem; i++) {
bus_dmamap_unload(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map);
bus_dmamap_destroy(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map);
}
}
static inline int
ddp_gl_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl)
{
int i, rc;
bus_addr_t pa;
MPASS(ci != NULL);
mtx_lock(&ci->map_lock);
for (i = 0; i < gl->nelem; i++) {
rc = bus_dmamap_create(ci->ulp_ddp_tag, 0,
&gl->dma_sg[i].bus_map);
if (rc != 0)
goto unmap;
rc = bus_dmamap_load(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map,
gl->pages[i], PAGE_SIZE, ulp2_dma_map_addr,
&pa, BUS_DMA_NOWAIT);
if (rc != 0)
goto unmap;
gl->dma_sg[i].phys_addr = pa;
}
mtx_unlock(&ci->map_lock);
return (0);
unmap:
if (i) {
u_int nelem = gl->nelem;
gl->nelem = i;
ddp_gl_unmap(ci, gl);
gl->nelem = nelem;
}
return (ENOMEM);
}
/**
* cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec - build ddp page buffer list
* @xferlen: total buffer length
* @sgl: page buffer scatter-gather list (struct cxgbei_sgl)
* @sgcnt: # of page buffers
* @gfp: allocation mode
*
* construct a ddp page buffer list from the scsi scatter-gather list.
* coalesce buffers as much as possible, and obtain dma addresses for
* each page.
*
* Return the cxgbei_ulp2_gather_list constructed from the page buffers if the
* memory can be used for ddp. Return NULL otherwise.
*/
struct cxgbei_ulp2_gather_list *
cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(u_int xferlen, struct cxgbei_sgl *sgl,
u_int sgcnt, struct cxgbei_data *ci, int gfp)
{
struct cxgbei_ulp2_gather_list *gl;
struct cxgbei_sgl *sg = sgl;
void *sgpage = (void *)((u64)sg->sg_addr & (~PAGE_MASK));
unsigned int sglen = sg->sg_length;
unsigned int sgoffset = (u64)sg->sg_addr & PAGE_MASK;
unsigned int npages = (xferlen + sgoffset + PAGE_SIZE - 1) >>
PAGE_SHIFT;
int i = 1, j = 0;
if (xferlen <= DDP_THRESHOLD) {
CTR2(KTR_CXGBE, "xfer %u <= threshold %u, no ddp.",
xferlen, DDP_THRESHOLD);
return (NULL);
}
gl = malloc(sizeof(struct cxgbei_ulp2_gather_list) +
npages * (sizeof(struct dma_segments) + sizeof(void *)),
M_DEVBUF, M_NOWAIT | M_ZERO);
if (gl == NULL)
return (NULL);
gl->pages = (void **)&gl->dma_sg[npages];
gl->length = xferlen;
gl->offset = sgoffset;
gl->pages[0] = sgpage;
CTR6(KTR_CXGBE,
"%s: xferlen:0x%x len:0x%x off:0x%x sg_addr:%p npages:%d",
__func__, xferlen, gl->length, gl->offset, sg->sg_addr, npages);
for (i = 1, sg = sg_next(sg); i < sgcnt; i++, sg = sg_next(sg)) {
void *page = sg->sg_addr;
if (sgpage == page && sg->sg_offset == sgoffset + sglen)
sglen += sg->sg_length;
else {
/*
* Make sure the sgl is suitable for ddp: every entry uses the
* same page size, and all middle pages are used completely.
*/
if ((j && sgoffset) ||
((i != sgcnt - 1) &&
((sglen + sgoffset) & ~CXGBEI_PAGE_MASK))){
goto error_out;
}
j++;
if (j == gl->nelem || sg->sg_offset) {
goto error_out;
}
gl->pages[j] = page;
sglen = sg->sg_length;
sgoffset = sg->sg_offset;
sgpage = page;
}
}
gl->nelem = ++j;
if (ddp_gl_map(ci, gl) != 0)
goto error_out;
return (gl);
error_out:
free(gl, M_DEVBUF);
return (NULL);
}
/**
* cxgbei_ulp2_ddp_release_gl - release a page buffer list
* @ci: adapter's ddp info
* @gl: a ddp page buffer list
*
* free a ddp page buffer list built by
* cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec().
*/
void
cxgbei_ulp2_ddp_release_gl(struct cxgbei_data *ci,
struct cxgbei_ulp2_gather_list *gl)
{
ddp_gl_unmap(ci, gl);
free(gl, M_DEVBUF);
}
/**
* cxgbei_ulp2_ddp_tag_reserve - set up ddp for a data transfer
* @ci: adapter's ddp info
* @tid: connection id
* @tformat: tag format
* @tagp: contains the s/w tag initially, updated with the ddp/hw tag
* @gl: the page memory list
* @gfp: allocation mode
*
* ddp setup for a given page buffer list and construction of the ddp tag.
* return 0 on success, an errno otherwise.
*/
int
cxgbei_ulp2_ddp_tag_reserve(struct cxgbei_data *ci, void *icc, u_int tid,
struct cxgbei_ulp2_tag_format *tformat, u32 *tagp,
struct cxgbei_ulp2_gather_list *gl, int gfp, int reply)
{
struct cxgbei_ulp2_pagepod_hdr hdr;
u_int npods, idx;
int rc;
u32 sw_tag = *tagp;
u32 tag;
MPASS(ci != NULL);
if (!gl || !gl->nelem || gl->length < DDP_THRESHOLD)
return (EINVAL);
npods = (gl->nelem + IPPOD_PAGES_MAX - 1) >> IPPOD_PAGES_SHIFT;
if (ci->idx_last == ci->nppods)
rc = ddp_find_unused_entries(ci, 0, ci->nppods, npods, &idx,
gl);
else {
rc = ddp_find_unused_entries(ci, ci->idx_last + 1,
ci->nppods, npods, &idx, gl);
if (rc && ci->idx_last >= npods) {
rc = ddp_find_unused_entries(ci, 0,
min(ci->idx_last + npods, ci->nppods),
npods, &idx, gl);
}
}
if (rc) {
CTR3(KTR_CXGBE, "xferlen %u, gl %u, npods %u NO DDP.",
gl->length, gl->nelem, npods);
return (rc);
}
tag = cxgbei_ulp2_ddp_tag_base(idx, ci->colors, tformat, sw_tag);
CTR4(KTR_CXGBE, "%s: sw_tag:0x%x idx:0x%x tag:0x%x",
__func__, sw_tag, idx, tag);
hdr.rsvd = 0;
hdr.vld_tid = htonl(F_IPPOD_VALID | V_IPPOD_TID(tid));
hdr.pgsz_tag_clr = htonl(tag & ci->rsvd_tag_mask);
hdr.maxoffset = htonl(gl->length);
hdr.pgoffset = htonl(gl->offset);
rc = t4_ddp_set_map(ci, icc, &hdr, idx, npods, gl, reply);
if (rc < 0)
goto unmark_entries;
ci->idx_last = idx;
*tagp = tag;
return (0);
unmark_entries:
ddp_unmark_entries(ci, idx, npods);
return (rc);
}
/**
* cxgbei_ulp2_ddp_tag_release - release a ddp tag
* @ci: adapter's ddp info
* @tag: ddp tag
* ddp cleanup for a given ddp tag: release all the resources held.
*/
void
cxgbei_ulp2_ddp_tag_release(struct cxgbei_data *ci, uint32_t tag,
struct icl_cxgbei_conn *icc)
{
uint32_t idx;
MPASS(ci != NULL);
MPASS(icc != NULL);
idx = (tag >> IPPOD_IDX_SHIFT) & ci->idx_mask;
CTR3(KTR_CXGBE, "tag:0x%x idx:0x%x nppods:0x%x",
tag, idx, ci->nppods);
if (idx < ci->nppods) {
struct cxgbei_ulp2_gather_list *gl = ci->gl_map[idx];
unsigned int npods;
if (!gl || !gl->nelem) {
CTR4(KTR_CXGBE,
"release 0x%x, idx 0x%x, gl 0x%p, %u.",
tag, idx, gl, gl ? gl->nelem : 0);
return;
}
npods = (gl->nelem + IPPOD_PAGES_MAX - 1) >> IPPOD_PAGES_SHIFT;
CTR3(KTR_CXGBE, "ddp tag 0x%x, release idx 0x%x, npods %u.",
tag, idx, npods);
t4_ddp_clear_map(ci, gl, tag, idx, npods, icc);
ddp_unmark_entries(ci, idx, npods);
cxgbei_ulp2_ddp_release_gl(ci, gl);
} else
CTR3(KTR_CXGBE, "ddp tag 0x%x, idx 0x%x > max 0x%x.",
tag, idx, ci->nppods);
}
/**
* cxgbei_ddp_cleanup - release the adapter's ddp resources
*/
void
cxgbei_ddp_cleanup(struct cxgbei_data *ci)
{
int i = 0;
while (i < ci->nppods) {
struct cxgbei_ulp2_gather_list *gl = ci->gl_map[i];
if (gl) {
int npods = (gl->nelem + IPPOD_PAGES_MAX - 1)
>> IPPOD_PAGES_SHIFT;
free(gl, M_DEVBUF);
i += npods;
} else
i++;
}
free(ci->colors, M_CXGBE);
free(ci->gl_map, M_CXGBE);
}
#endif

sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h
@@ -0,0 +1,217 @@
/*-
* Copyright (c) 2012 Chelsio Communications, Inc.
* All rights reserved.
*
* Chelsio T5xx iSCSI driver
* cxgbei_ulp2_ddp.h: Chelsio iSCSI DDP Manager.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#ifndef __CXGBEI_ULP2_DDP_H__
#define __CXGBEI_ULP2_DDP_H__
#define CXGBEI_PAGE_MASK (~(PAGE_SIZE-1))
#define DDP_THRESHOLD 2048
/*
* A cxgbei ddp tag is 32 bits: reserved bits used by the h/w plus
* non-reserved bits available to the iscsi s/w.  The reserved bits are
* identified by the rsvd_bits and rsvd_shift fields in struct
* cxgbei_ulp2_tag_format.
*
* The uppermost reserved bit indicates whether a tag is a ddp tag:
* if that bit is 0, the tag is a valid ddp tag.
*/
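/*
 * Example with a hypothetical rsvd_bits = 8 and rsvd_shift = 6: the
 * uppermost reserved bit is bit 13 (0x2000), so cxgbei_ulp2_is_ddp_tag()
 * reports 0x0485 as a ddp tag, while 0x2485 (bit 13 set, as
 * cxgbei_ulp2_set_non_ddp_tag() would do) is not.
 */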
/*
* cxgbei_ulp2_is_ddp_tag - check if a given tag is a hw/ddp tag
* @tformat: tag format information
* @tag: tag to be checked
*
* return true if the tag is a ddp tag, false otherwise.
*/
static inline int
cxgbei_ulp2_is_ddp_tag(struct cxgbei_ulp2_tag_format *tformat, uint32_t tag)
{
return (!(tag & (1 << (tformat->rsvd_bits + tformat->rsvd_shift - 1))));
}
/*
* cxgbei_ulp2_sw_tag_usable - check if s/w tag has enough bits left for hw bits
* @tformat: tag format information
* @sw_tag: s/w tag to be checked
*
* return true if the tag can be used for hw ddp tag, false otherwise.
*/
static inline int
cxgbei_ulp2_sw_tag_usable(struct cxgbei_ulp2_tag_format *tformat,
uint32_t sw_tag)
{
return (1); /* XXXNP: huh? */
sw_tag >>= (32 - tformat->rsvd_bits + tformat->rsvd_shift);
return !sw_tag;
}
/*
* cxgbei_ulp2_set_non_ddp_tag - mark a given s/w tag as an invalid ddp tag
* @tformat: tag format information
* @sw_tag: s/w tag to be checked
*
* set the uppermost reserved bit to mark the tag as an invalid ddp tag.
*/
static inline uint32_t
cxgbei_ulp2_set_non_ddp_tag(struct cxgbei_ulp2_tag_format *tformat,
uint32_t sw_tag)
{
uint32_t rsvd_bits = tformat->rsvd_bits + tformat->rsvd_shift;
if (sw_tag) {
u32 v1 = sw_tag & ((1 << (rsvd_bits - 1)) - 1);
u32 v2 = (sw_tag >> (rsvd_bits - 1)) << rsvd_bits;
return (v2 | (1 << (rsvd_bits - 1)) | v1);
}
return (sw_tag | (1 << (rsvd_bits - 1)));
}
struct dma_segments {
bus_dmamap_t bus_map;
bus_addr_t phys_addr;
};
/*
* struct cxgbei_ulp2_gather_list - cxgbei direct data placement memory
*
* @tag: ddp tag
* @length: total data buffer length
* @offset: initial offset to the 1st page
* @nelem: # of pages
* @pages: page pointers
* @phys_addr: physical address
*/
struct cxgbei_ulp2_gather_list {
uint32_t tag;
uint32_t tid;
uint32_t port_id;
void *egress_dev;
unsigned int length;
unsigned int offset;
unsigned int nelem;
bus_size_t mapsize;
bus_dmamap_t bus_map;
bus_dma_segment_t *segments;
void **pages;
struct dma_segments dma_sg[0];
};
#define IPPOD_SIZE sizeof(struct cxgbei_ulp2_pagepod) /* 64 */
#define IPPOD_SIZE_SHIFT 6
#define IPPOD_COLOR_SHIFT 0
#define IPPOD_COLOR_SIZE 6
#define IPPOD_COLOR_MASK ((1 << IPPOD_COLOR_SIZE) - 1)
#define IPPOD_IDX_SHIFT IPPOD_COLOR_SIZE
#define IPPOD_IDX_MAX_SIZE 24
#define S_IPPOD_TID 0
#define M_IPPOD_TID 0xFFFFFF
#define V_IPPOD_TID(x) ((x) << S_IPPOD_TID)
#define S_IPPOD_VALID 24
#define V_IPPOD_VALID(x) ((x) << S_IPPOD_VALID)
#define F_IPPOD_VALID V_IPPOD_VALID(1U)
#define S_IPPOD_COLOR 0
#define M_IPPOD_COLOR 0x3F
#define V_IPPOD_COLOR(x) ((x) << S_IPPOD_COLOR)
#define S_IPPOD_TAG 6
#define M_IPPOD_TAG 0xFFFFFF
#define V_IPPOD_TAG(x) ((x) << S_IPPOD_TAG)
#define S_IPPOD_PGSZ 30
#define M_IPPOD_PGSZ 0x3
#define V_IPPOD_PGSZ(x) ((x) << S_IPPOD_PGSZ)
static inline uint32_t
cxgbei_ulp2_ddp_tag_base(u_int idx, u_char *colors,
struct cxgbei_ulp2_tag_format *tformat, uint32_t sw_tag)
{
if (__predict_false(++colors[idx] == 1 << IPPOD_IDX_SHIFT))
colors[idx] = 0;
sw_tag <<= tformat->rsvd_bits + tformat->rsvd_shift;
return (sw_tag | idx << IPPOD_IDX_SHIFT | colors[idx]);
}
#define ISCSI_PDU_NONPAYLOAD_LEN 312 /* bhs(48) + ahs(256) + digest(8) */
/*
* align pdu size to multiple of 512 for better performance
*/
#define cxgbei_align_pdu_size(n) do { n = (n) & (~511); } while (0)
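/* Example: n = 16224 becomes 15872 (31 * 512); this macro only rounds down. */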
#define ULP2_MAX_PKT_SIZE 16224
#define ULP2_MAX_PDU_PAYLOAD (ULP2_MAX_PKT_SIZE - ISCSI_PDU_NONPAYLOAD_LEN)
#define IPPOD_PAGES_MAX 4
#define IPPOD_PAGES_SHIFT 2 /* 4 pages per pod */
/*
* struct pagepod_hdr, pagepod - pagepod format
*/
struct cxgbei_ulp2_pagepod_hdr {
uint32_t vld_tid;
uint32_t pgsz_tag_clr;
uint32_t maxoffset;
uint32_t pgoffset;
uint64_t rsvd;
};
struct cxgbei_ulp2_pagepod {
struct cxgbei_ulp2_pagepod_hdr hdr;
uint64_t addr[IPPOD_PAGES_MAX + 1];
};
int cxgbei_ulp2_ddp_tag_reserve(struct cxgbei_data *, void *, unsigned int,
struct cxgbei_ulp2_tag_format *, uint32_t *,
struct cxgbei_ulp2_gather_list *, int, int);
void cxgbei_ulp2_ddp_tag_release(struct cxgbei_data *, uint32_t,
struct icl_cxgbei_conn *);
struct cxgbei_ulp2_gather_list *cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(u_int,
struct cxgbei_sgl *, u_int, struct cxgbei_data *, int);
void cxgbei_ulp2_ddp_release_gl(struct cxgbei_data *,
struct cxgbei_ulp2_gather_list *);
int cxgbei_ulp2_ddp_find_page_index(u_long);
int cxgbei_ulp2_adapter_ddp_info(struct cxgbei_data *,
struct cxgbei_ulp2_tag_format *);
void cxgbei_ddp_cleanup(struct cxgbei_data *);
#endif

sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -0,0 +1,896 @@
/*-
* Copyright (c) 2012 The FreeBSD Foundation
* Copyright (c) 2015 Chelsio Communications, Inc.
* All rights reserved.
*
* This software was developed by Edward Tomasz Napierala under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* cxgbei implementation of iSCSI Common Layer kobj(9) interface.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <machine/bus.h>
#include <vm/uma.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <icl_conn_if.h>
#include "common/common.h"
#include "tom/t4_tom.h"
#include "cxgbei.h"
SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD, 0, "Chelsio iSCSI offload");
static int coalesce = 1;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, coalesce, CTLFLAG_RWTUN,
&coalesce, 0, "Try to coalesce PDUs before sending");
static int partial_receive_len = 128 * 1024;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
&partial_receive_len, 0, "Minimum read size for partially received "
"data segment");
static int sendspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
&sendspace, 0, "Default send socket buffer size");
static int recvspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
&recvspace, 0, "Default receive socket buffer size");
static uma_zone_t icl_transfer_zone;
static volatile u_int icl_cxgbei_ncons;
#define ICL_CONN_LOCK(X) mtx_lock((X)->ic_lock)
#define ICL_CONN_UNLOCK(X) mtx_unlock((X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X) mtx_assert((X)->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert((X)->ic_lock, MA_NOTOWNED)
struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu;
icl_conn_pdu_free_t icl_cxgbei_conn_pdu_free;
static icl_conn_pdu_data_segment_length_t
icl_cxgbei_conn_pdu_data_segment_length;
static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data;
static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data;
static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue;
static icl_conn_handoff_t icl_cxgbei_conn_handoff;
static icl_conn_free_t icl_cxgbei_conn_free;
static icl_conn_close_t icl_cxgbei_conn_close;
static icl_conn_task_setup_t icl_cxgbei_conn_task_setup;
static icl_conn_task_done_t icl_cxgbei_conn_task_done;
static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup;
static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done;
static kobj_method_t icl_cxgbei_methods[] = {
KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
KOBJMETHOD(icl_conn_pdu_data_segment_length,
icl_cxgbei_conn_pdu_data_segment_length),
KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
{ 0, 0 }
};
DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));
#if 0
/*
* Subtract another 256 for AHS from MAX_DSL if AHS could be used.
*/
#define CXGBEI_MAX_PDU 16224
#define CXGBEI_MAX_DSL (CXGBEI_MAX_PDU - sizeof(struct iscsi_bhs) - 8)
#endif
#define CXGBEI_MAX_DSL 8192
#define CXGBEI_MAX_PDU (CXGBEI_MAX_DSL + sizeof(struct iscsi_bhs) + 8)
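/*
 * The "+ 8" covers the header and data digests (4 bytes each), so a
 * full PDU here is at most 8192 + 48 + 8 = 8248 bytes.
 */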
void
icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
#ifdef INVARIANTS
struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
#endif
MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
MPASS(ic == ip->ip_conn);
MPASS(ip->ip_bhs_mbuf != NULL);
m_freem(ip->ip_ahs_mbuf);
m_freem(ip->ip_data_mbuf);
m_freem(ip->ip_bhs_mbuf); /* storage for icl_cxgbei_pdu itself */
#ifdef DIAGNOSTIC
if (__predict_true(ic != NULL))
refcount_release(&ic->ic_outstanding_pdus);
#endif
}
struct icl_pdu *
icl_cxgbei_new_pdu(int flags)
{
struct icl_cxgbei_pdu *icp;
struct icl_pdu *ip;
struct mbuf *m;
uintptr_t a;
m = m_gethdr(flags, MT_DATA);
if (__predict_false(m == NULL))
return (NULL);
a = roundup2(mtod(m, uintptr_t), _Alignof(struct icl_cxgbei_pdu));
icp = (struct icl_cxgbei_pdu *)a;
bzero(icp, sizeof(*icp));
icp->icp_signature = CXGBEI_PDU_SIGNATURE;
ip = &icp->ip;
ip->ip_bhs_mbuf = m;
a = roundup2((uintptr_t)(icp + 1), _Alignof(struct iscsi_bhs *));
ip->ip_bhs = (struct iscsi_bhs *)a;
#ifdef INVARIANTS
/* Everything must fit entirely in the mbuf. */
a = (uintptr_t)(ip->ip_bhs + 1);
MPASS(a <= (uintptr_t)m + MSIZE);
#endif
bzero(ip->ip_bhs, sizeof(*ip->ip_bhs));
m->m_data = (void *)ip->ip_bhs;
m->m_len = sizeof(struct iscsi_bhs);
m->m_pkthdr.len = m->m_len;
return (ip);
}
void
icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
{
ip->ip_conn = ic;
#ifdef DIAGNOSTIC
refcount_acquire(&ic->ic_outstanding_pdus);
#endif
}
/*
* Allocate an icl_pdu with an empty BHS, to be filled in by the caller.
*/
static struct icl_pdu *
icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
{
struct icl_pdu *ip;
ip = icl_cxgbei_new_pdu(flags);
if (__predict_false(ip == NULL))
return (NULL);
icl_cxgbei_new_pdu_set_conn(ip, ic);
return (ip);
}
static size_t
icl_pdu_data_segment_length(const struct icl_pdu *request)
{
uint32_t len = 0;
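/*
 * DataSegmentLength in the BHS is a 24-bit big-endian field; e.g. the
 * three bytes { 0x00, 0x20, 0x00 } decode to 0x002000 = 8192.
 */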
len += request->ip_bhs->bhs_data_segment_len[0];
len <<= 8;
len += request->ip_bhs->bhs_data_segment_len[1];
len <<= 8;
len += request->ip_bhs->bhs_data_segment_len[2];
return (len);
}
size_t
icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
const struct icl_pdu *request)
{
return (icl_pdu_data_segment_length(request));
}
static uint32_t
icl_conn_build_tasktag(struct icl_conn *ic, uint32_t tag)
{
return (tag);
}
static struct mbuf *
finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
{
struct icl_pdu *ip = &icp->ip;
uint8_t ulp_submode, padding;
struct mbuf *m, *last;
struct iscsi_bhs *bhs;
/*
* Fix up the data segment mbuf first.
*/
m = ip->ip_data_mbuf;
ulp_submode = icc->ulp_submode;
if (m) {
last = m_last(m);
/*
* Round up the data segment to a 4B boundary. Pad with 0 if
* necessary. There will definitely be room in the mbuf.
*/
padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
if (padding) {
bzero(mtod(last, uint8_t *) + last->m_len, padding);
last->m_len += padding;
}
} else {
MPASS(ip->ip_data_len == 0);
ulp_submode &= ~ULP_CRC_DATA;
padding = 0;
}
/*
* Now the header mbuf that has the BHS.
*/
m = ip->ip_bhs_mbuf;
MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
MPASS(m->m_len == sizeof(struct iscsi_bhs));
bhs = ip->ip_bhs;
bhs->bhs_data_segment_len[2] = ip->ip_data_len;
bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8;
bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16;
/* "Convert" PDU to mbuf chain. Do not use icp/ip after this. */
m->m_pkthdr.len = sizeof(struct iscsi_bhs) + ip->ip_data_len + padding;
m->m_next = ip->ip_data_mbuf;
set_mbuf_ulp_submode(m, ulp_submode);
#ifdef INVARIANTS
bzero(icp, sizeof(*icp));
#endif
#ifdef DIAGNOSTIC
refcount_release(&icc->ic.ic_outstanding_pdus);
#endif
return (m);
}
int
icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
const void *addr, size_t len, int flags)
{
struct mbuf *m;
#ifdef INVARIANTS
struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
#endif
MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
MPASS(ic == ip->ip_conn);
KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));
m = ip->ip_data_mbuf;
if (m == NULL) {
m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
if (__predict_false(m == NULL))
return (ENOMEM);
ip->ip_data_mbuf = m;
}
if (__predict_true(m_append(m, len, addr) != 0)) {
ip->ip_data_len += len;
MPASS(ip->ip_data_len <= CXGBEI_MAX_DSL);
return (0);
} else {
if (flags & M_WAITOK) {
CXGBE_UNIMPLEMENTED("fail safe append");
}
ip->ip_data_len = m_length(m, NULL);
return (1);
}
}
void
icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
size_t off, void *addr, size_t len)
{
struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
if (icp->pdu_flags & SBUF_ULP_FLAG_DATA_DDPED)
return; /* data is DDP'ed, no need to copy */
m_copydata(ip->ip_data_mbuf, off, len, addr);
}
void
icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{
struct icl_cxgbei_conn *icc = ic_to_icc(ic);
struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
struct socket *so = ic->ic_socket;
struct toepcb *toep = icc->toep;
struct inpcb *inp;
struct mbuf *m;
MPASS(ic == ip->ip_conn);
MPASS(ip->ip_bhs_mbuf != NULL);
/* The kernel doesn't generate PDUs with AHS. */
MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);
ICL_CONN_LOCK_ASSERT(ic);
/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
icl_cxgbei_conn_pdu_free(ic, ip);
return;
}
m = finalize_pdu(icc, icp);
M_ASSERTPKTHDR(m);
MPASS((m->m_pkthdr.len & 3) == 0);
MPASS(m->m_pkthdr.len + 8 <= CXGBEI_MAX_PDU);
/*
* Do not get inp from toep->inp as the toepcb might have detached
* already.
*/
inp = sotoinpcb(so);
INP_WLOCK(inp);
if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) ||
__predict_false((toep->flags & TPF_ATTACHED) == 0))
m_freem(m);
else {
mbufq_enqueue(&toep->ulp_pduq, m);
t4_push_pdus(icc->sc, toep, 0);
}
INP_WUNLOCK(inp);
}
static struct icl_conn *
icl_cxgbei_new_conn(const char *name, struct mtx *lock)
{
struct icl_cxgbei_conn *icc;
struct icl_conn *ic;
refcount_acquire(&icl_cxgbei_ncons);
icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
M_WAITOK | M_ZERO);
icc->icc_signature = CXGBEI_CONN_SIGNATURE;
STAILQ_INIT(&icc->rcvd_pdus);
ic = &icc->ic;
ic->ic_lock = lock;
/* XXXNP: review. Most of these icl_conn fields aren't really used */
STAILQ_INIT(&ic->ic_to_send);
cv_init(&ic->ic_send_cv, "icl_cxgbei_tx");
cv_init(&ic->ic_receive_cv, "icl_cxgbei_rx");
#ifdef DIAGNOSTIC
refcount_init(&ic->ic_outstanding_pdus, 0);
#endif
ic->ic_max_data_segment_length = CXGBEI_MAX_DSL;
ic->ic_name = name;
ic->ic_offload = "cxgbei";
CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
return (ic);
}
void
icl_cxgbei_conn_free(struct icl_conn *ic)
{
struct icl_cxgbei_conn *icc = ic_to_icc(ic);
MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
cv_destroy(&ic->ic_send_cv);
cv_destroy(&ic->ic_receive_cv);
kobj_delete((struct kobj *)icc, M_CXGBE);
refcount_release(&icl_cxgbei_ncons);
}
static int
icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so)
{
size_t minspace;
struct sockopt opt;
int error, one = 1;
/*
* For sendspace, this is required because the current code cannot
* send a PDU in pieces; thus, the minimum buffer size is equal
* to the maximum PDU size. "+4" is to account for possible padding.
*
* What we should actually do here is to use autoscaling, but set
* some minimal buffer size to "minspace". I don't know a way to do
* that, though.
*/
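/*
 * With the values in this file (CXGBEI_MAX_DSL 8192, 48-byte BHS,
 * 4-byte header and data digests), minspace below works out to
 * 48 + 8192 + 4 + 4 + 4 = 8252 bytes.
 */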
minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
if (sendspace < minspace)
sendspace = minspace;
if (recvspace < minspace)
recvspace = minspace;
error = soreserve(so, sendspace, recvspace);
if (error != 0) {
icl_cxgbei_conn_close(ic);
return (error);
}
SOCKBUF_LOCK(&so->so_snd);
so->so_snd.sb_flags |= SB_AUTOSIZE;
SOCKBUF_UNLOCK(&so->so_snd);
SOCKBUF_LOCK(&so->so_rcv);
so->so_rcv.sb_flags |= SB_AUTOSIZE;
SOCKBUF_UNLOCK(&so->so_rcv);
/*
* Disable Nagle.
*/
bzero(&opt, sizeof(opt));
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_TCP;
opt.sopt_name = TCP_NODELAY;
opt.sopt_val = &one;
opt.sopt_valsize = sizeof(one);
error = sosetopt(so, &opt);
if (error != 0) {
icl_cxgbei_conn_close(ic);
return (error);
}
return (0);
}
/*
* Request/response structure used to find out the adapter offloading a socket.
*/
struct find_ofld_adapter_rr {
struct socket *so;
struct adapter *sc; /* result */
};
static void
find_offload_adapter(struct adapter *sc, void *arg)
{
struct find_ofld_adapter_rr *fa = arg;
struct socket *so = fa->so;
struct tom_data *td = sc->tom_softc;
struct tcpcb *tp;
struct inpcb *inp;
/* Non-TCP sockets were filtered out earlier. */
MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);
if (fa->sc != NULL)
return; /* Found already. */
if (td == NULL)
return; /* TOE not enabled on this adapter. */
inp = sotoinpcb(so);
INP_WLOCK(inp);
if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
tp = intotcpcb(inp);
if (tp->t_flags & TF_TOE && tp->tod == &td->tod)
fa->sc = sc; /* Found. */
}
INP_WUNLOCK(inp);
}
/* XXXNP: move this to t4_tom. */
static void
send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
{
struct wrqe *wr;
struct fw_flowc_wr *flowc;
const u_int nparams = 1;
u_int flowclen;
struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
if (wr == NULL) {
/* XXX */
panic("%s: allocation failure.", __func__);
}
flowc = wrtod(wr);
memset(flowc, 0, wr->wr_len);
flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
V_FW_FLOWC_WR_NPARAMS(nparams));
flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
V_FW_WR_FLOWID(toep->tid));
flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
flowc->mnemval[0].val = htobe32(maxlen);
txsd->tx_credits = howmany(flowclen, 16);
txsd->plen = 0;
KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
("%s: not enough credits (%d)", __func__, toep->tx_credits));
toep->tx_credits -= txsd->tx_credits;
if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
toep->txsd_pidx = 0;
toep->txsd_avail--;
t4_wrq_tx(sc, wr);
}
static void
set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, int hcrc, int dcrc)
{
uint64_t val = 0;
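/* CRC submode flags are shifted above the 4-bit ULP mode below. */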
if (hcrc)
val |= ULP_CRC_HEADER;
if (dcrc)
val |= ULP_CRC_DATA;
val <<= 4;
val |= ULP_MODE_ISCSI;
CTR4(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, CRC hdr=%d data=%d",
__func__, toep->tid, hcrc, dcrc);
t4_set_tcb_field(sc, toep, 1, 0, 0xfff, val);
}
/*
* XXXNP: Who is responsible for cleaning up the socket if this returns with an
* error? Review all error paths.
*
* XXXNP: What happens to the socket's fd reference if the operation is
* successful, and how does that affect the socket's life cycle?
*/
int
icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
{
struct icl_cxgbei_conn *icc = ic_to_icc(ic);
struct find_ofld_adapter_rr fa;
struct file *fp;
struct socket *so;
struct inpcb *inp;
struct tcpcb *tp;
struct toepcb *toep;
cap_rights_t rights;
int error;
MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
ICL_CONN_LOCK_ASSERT_NOT(ic);
/*
* Steal the socket from userland.
*/
error = fget(curthread, fd,
cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
if (error != 0)
return (error);
if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, curthread);
return (EINVAL);
}
so = fp->f_data;
if (so->so_type != SOCK_STREAM ||
so->so_proto->pr_protocol != IPPROTO_TCP) {
fdrop(fp, curthread);
return (EINVAL);
}
ICL_CONN_LOCK(ic);
if (ic->ic_socket != NULL) {
ICL_CONN_UNLOCK(ic);
fdrop(fp, curthread);
return (EBUSY);
}
ic->ic_disconnecting = false;
ic->ic_socket = so;
fp->f_ops = &badfileops;
fp->f_data = NULL;
fdrop(fp, curthread);
ICL_CONN_UNLOCK(ic);
/* Find the adapter offloading this socket. */
fa.sc = NULL;
fa.so = so;
t4_iterate(find_offload_adapter, &fa);
if (fa.sc == NULL)
return (EINVAL);
icc->sc = fa.sc;
error = icl_cxgbei_setsockopt(ic, so);
if (error)
return (error);
inp = sotoinpcb(so);
INP_WLOCK(inp);
tp = intotcpcb(inp);
if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
error = EBUSY;
else {
/*
* The socket cannot have been "unoffloaded" if we are here.
*/
MPASS(tp->t_flags & TF_TOE);
MPASS(tp->tod != NULL);
MPASS(tp->t_toe != NULL);
toep = tp->t_toe;
MPASS(toep->vi->pi->adapter == icc->sc);
icc->toep = toep;
icc->cwt = cxgbei_select_worker_thread(icc);
icc->ulp_submode = 0;
if (ic->ic_header_crc32c)
icc->ulp_submode |= ULP_CRC_HEADER;
if (ic->ic_data_crc32c)
icc->ulp_submode |= ULP_CRC_DATA;
so->so_options |= SO_NO_DDP;
toep->ulp_mode = ULP_MODE_ISCSI;
toep->ulpcb = icc;
send_iscsi_flowc_wr(icc->sc, toep, CXGBEI_MAX_PDU);
set_ulp_mode_iscsi(icc->sc, toep, ic->ic_header_crc32c,
ic->ic_data_crc32c);
error = 0;
}
INP_WUNLOCK(inp);
return (error);
}
void
icl_cxgbei_conn_close(struct icl_conn *ic)
{
struct icl_cxgbei_conn *icc = ic_to_icc(ic);
struct icl_pdu *ip;
struct socket *so;
struct sockbuf *sb;
struct inpcb *inp;
struct toepcb *toep = icc->toep;
MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
ICL_CONN_LOCK_ASSERT_NOT(ic);
ICL_CONN_LOCK(ic);
so = ic->ic_socket;
if (ic->ic_disconnecting || so == NULL) {
CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
__func__, icc, ic->ic_disconnecting, so);
ICL_CONN_UNLOCK(ic);
return;
}
ic->ic_disconnecting = true;
/* These are unused in this driver right now. */
MPASS(STAILQ_EMPTY(&ic->ic_to_send));
MPASS(ic->ic_receive_pdu == NULL);
#ifdef DIAGNOSTIC
KASSERT(ic->ic_outstanding_pdus == 0,
("destroying session with %d outstanding PDUs",
ic->ic_outstanding_pdus));
#endif
ICL_CONN_UNLOCK(ic);
CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
icc);
inp = sotoinpcb(so);
sb = &so->so_rcv;
INP_WLOCK(inp);
if (toep != NULL) { /* NULL if connection was never offloaded. */
toep->ulpcb = NULL;
mbufq_drain(&toep->ulp_pduq);
SOCKBUF_LOCK(sb);
if (icc->rx_flags & RXF_ACTIVE) {
volatile u_int *p = &icc->rx_flags;
SOCKBUF_UNLOCK(sb);
INP_WUNLOCK(inp);
while (*p & RXF_ACTIVE)
pause("conclo", 1);
INP_WLOCK(inp);
SOCKBUF_LOCK(sb);
}
while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
ip = STAILQ_FIRST(&icc->rcvd_pdus);
STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
icl_cxgbei_conn_pdu_free(ic, ip);
}
SOCKBUF_UNLOCK(sb);
}
INP_WUNLOCK(inp);
ICL_CONN_LOCK(ic);
ic->ic_socket = NULL;
ICL_CONN_UNLOCK(ic);
/*
* XXXNP: we should send RST instead of FIN when PDUs held in various
* queues were purged instead of delivered reliably but soabort isn't
* really general purpose and wouldn't do the right thing here.
*/
soclose(so);
}
int
icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct ccb_scsiio *csio,
uint32_t *task_tagp, void **prvp)
{
void *prv;
*task_tagp = icl_conn_build_tasktag(ic, *task_tagp);
prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
if (prv == NULL)
return (ENOMEM);
*prvp = prv;
cxgbei_conn_task_reserve_itt(ic, prvp, csio, task_tagp);
return (0);
}
void
icl_cxgbei_conn_task_done(struct icl_conn *ic, void *prv)
{
cxgbei_cleanup_task(ic, prv);
uma_zfree(icl_transfer_zone, prv);
}
int
icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
uint32_t *transfer_tag, void **prvp)
{
void *prv;
*transfer_tag = icl_conn_build_tasktag(ic, *transfer_tag);
prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
if (prv == NULL)
return (ENOMEM);
*prvp = prv;
cxgbei_conn_transfer_reserve_ttt(ic, prvp, io, transfer_tag);
return (0);
}
void
icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *prv)
{
cxgbei_cleanup_task(ic, prv);
uma_zfree(icl_transfer_zone, prv);
}
static int
icl_cxgbei_limits(size_t *limitp)
{
*limitp = CXGBEI_MAX_DSL;
return (0);
}
static int
icl_cxgbei_load(void)
{
int error;
icl_transfer_zone = uma_zcreate("icl_transfer",
16 * 1024, NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
refcount_init(&icl_cxgbei_ncons, 0);
error = icl_register("cxgbei", 100, icl_cxgbei_limits,
icl_cxgbei_new_conn);
KASSERT(error == 0, ("failed to register"));
return (error);
}
static int
icl_cxgbei_unload(void)
{
if (icl_cxgbei_ncons != 0)
return (EBUSY);
icl_unregister("cxgbei");
uma_zdestroy(icl_transfer_zone);
return (0);
}
static int
icl_cxgbei_modevent(module_t mod, int what, void *arg)
{
switch (what) {
case MOD_LOAD:
return (icl_cxgbei_load());
case MOD_UNLOAD:
return (icl_cxgbei_unload());
default:
return (EINVAL);
}
}
moduledata_t icl_cxgbei_data = {
"icl_cxgbei",
icl_cxgbei_modevent,
0
};
DECLARE_MODULE(icl_cxgbei, icl_cxgbei_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(icl_cxgbei, icl, 1, 1, 1);
MODULE_VERSION(icl_cxgbei, 1);
#endif

sys/modules/cxgbe/Makefile
@@ -11,9 +11,11 @@ SUBDIR+= t4_firmware
 SUBDIR+= t5_firmware
 SUBDIR+= ${_tom}
 SUBDIR+= ${_iw_cxgbe}
+SUBDIR+= ${_cxgbei}
 .if ${MACHINE_CPUARCH} == "amd64"
 _tom= tom
+_cxgbei= cxgbei
 .if ${MK_OFED} != "no" || defined(ALL_MODULES)
 _iw_cxgbe= iw_cxgbe
 .endif

sys/modules/cxgbe/cxgbei/Makefile
@@ -0,0 +1,23 @@
# $FreeBSD$
CXGBE = ${.CURDIR}/../../../dev/cxgbe
.PATH: ${CXGBE}/cxgbei
KMOD= cxgbei
SRCS= cxgbei.c
SRCS+= cxgbei_ulp2_ddp.c
SRCS+= icl_cxgbei.c
SRCS+= bus_if.h
SRCS+= device_if.h
SRCS+= opt_inet.h
SRCS+= opt_inet6.h
SRCS+= pci_if.h
SRCS+= opt_cam.h
SRCS+= icl_conn_if.h
CFLAGS+= -I${CXGBE}
MFILES= kern/bus_if.m kern/device_if.m dev/iscsi/icl_conn_if.m dev/pci/pci_if.m
.include <bsd.kmod.mk>