Maintain and observe a ZBUF_FLAG_IMMUTABLE flag on zero-copy BPF
buffer kernel descriptors, which is used to allow the buffer currently in the BPF "store" position to be assigned to userspace when it fills, even if userspace hasn't acknowledged the buffer in the "hold" position yet. To implement this, notify the buffer model when a buffer becomes full, and check that the store buffer is writable, not just for it being full, before trying to append new packet data. Shared memory buffers will be assigned to userspace at most once per fill, be it in the store or in the hold position. This removes the restriction that at most one shared memory can by owned by userspace, reducing the chances that userspace will need to call select() after acknowledging one buffer in order to wait for the next buffer when under high load. This more fully realizes the goal of zero system calls in order to process a high-speed packet stream from BPF. Update bpf.4 to reflect that both buffers may be owned by userspace at once; caution against assuming this.
This commit is contained in:
parent
08304c1617
commit
a7a91e6592
@ -259,14 +259,14 @@ may be used to sleep awaiting the availbility of a completed buffer.
|
||||
They will return a readable file descriptor when ownership of the next buffer
|
||||
is assigned to user space.
|
||||
.Pp
|
||||
In the current implementation, the kernel will assign ownership of at most
|
||||
one buffer at a time to the user process.
|
||||
The user processes must acknowledge the current buffer in order to be
|
||||
notified that the next buffer is ready for processing.
|
||||
Programs should not rely on this as an invariant, as it may change in future
|
||||
versions; in particular, they must maintain their own notion of which buffer
|
||||
is "next" so that if both buffers are owned by userspace, it can process them
|
||||
in the correct order.
|
||||
In the current implementation, the kernel may assign zero, one, or both
|
||||
buffers to the user process; however, an earlier implementation maintained
|
||||
the invariant that at most one buffer could be assigned to the user process
|
||||
at a time.
|
||||
In order to both ensure progress and high performance, user processes should
|
||||
acknowledge a completely processed buffer as quickly as possible, returning
|
||||
it for reuse, and not block waiting on a second buffer while holding another
|
||||
buffer.
|
||||
.Sh IOCTLS
|
||||
The
|
||||
.Xr ioctl 2
|
||||
|
@ -218,6 +218,45 @@ bpf_canfreebuf(struct bpf_d *d)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allow the buffer model to indicate that the current store buffer is
|
||||
* immutable, regardless of the appearance of space. Return (1) if the
|
||||
* buffer is writable, and (0) if not.
|
||||
*/
|
||||
static int
|
||||
bpf_canwritebuf(struct bpf_d *d)
|
||||
{
|
||||
|
||||
BPFD_LOCK_ASSERT(d);
|
||||
|
||||
switch (d->bd_bufmode) {
|
||||
case BPF_BUFMODE_ZBUF:
|
||||
return (bpf_zerocopy_canwritebuf(d));
|
||||
}
|
||||
return (1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify buffer model that an attempt to write to the store buffer has
|
||||
* resulted in a dropped packet, in which case the buffer may be considered
|
||||
* full.
|
||||
*/
|
||||
static void
|
||||
bpf_buffull(struct bpf_d *d)
|
||||
{
|
||||
|
||||
BPFD_LOCK_ASSERT(d);
|
||||
|
||||
switch (d->bd_bufmode) {
|
||||
case BPF_BUFMODE_ZBUF:
|
||||
bpf_zerocopy_buffull(d);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify the buffer model that a buffer has moved into the hold position.
|
||||
*/
|
||||
void
|
||||
bpf_bufheld(struct bpf_d *d)
|
||||
{
|
||||
@ -1691,27 +1730,28 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
|
||||
|
||||
/*
|
||||
* Round up the end of the previous packet to the next longword.
|
||||
*
|
||||
* Drop the packet if there's no room and no hope of room
|
||||
* If the packet would overflow the storage buffer or the storage
|
||||
* buffer is considered immutable by the buffer model, try to rotate
|
||||
* the buffer and wakeup pending processes.
|
||||
*/
|
||||
curlen = BPF_WORDALIGN(d->bd_slen);
|
||||
if (curlen + totlen > d->bd_bufsize) {
|
||||
/*
|
||||
* This packet will overflow the storage buffer.
|
||||
* Rotate the buffers if we can, then wakeup any
|
||||
* pending reads.
|
||||
*/
|
||||
if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
|
||||
if (d->bd_fbuf == NULL) {
|
||||
/*
|
||||
* We haven't completed the previous read yet,
|
||||
* so drop the packet.
|
||||
* There's no room in the store buffer, and no
|
||||
* prospect of room, so drop the packet. Notify the
|
||||
* buffer model.
|
||||
*/
|
||||
bpf_buffull(d);
|
||||
++d->bd_dcount;
|
||||
return;
|
||||
}
|
||||
ROTATE_BUFFERS(d);
|
||||
do_wakeup = 1;
|
||||
curlen = 0;
|
||||
}
|
||||
else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
|
||||
} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
|
||||
/*
|
||||
* Immediate mode is set, or the read timeout has already
|
||||
* expired during a select call. A packet arrived, so the
|
||||
|
@ -85,7 +85,7 @@ __FBSDID("$FreeBSD$");
|
||||
* scatter-gather copying. One significant mitigating factor is that on
|
||||
* systems with a direct memory map, we can avoid TLB misses.
|
||||
*
|
||||
* At the front of the shared memor region is a bpf_zbuf_header, which
|
||||
* At the front of the shared memory region is a bpf_zbuf_header, which
|
||||
* contains shared control data to allow user space and the kernel to
|
||||
* synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF
|
||||
* knows that the space is not available.
|
||||
@ -94,10 +94,18 @@ struct zbuf {
|
||||
vm_offset_t zb_uaddr; /* User address, may be stale. */
|
||||
size_t zb_size; /* Size of buffer, incl. header. */
|
||||
u_int zb_numpages; /* Number of pages. */
|
||||
int zb_flags; /* Flags on zbuf. */
|
||||
struct sf_buf **zb_pages; /* Pages themselves. */
|
||||
struct bpf_zbuf_header *zb_header; /* Shared header. */
|
||||
};
|
||||
|
||||
/*
|
||||
* When a buffer has been assigned to userspace, flag it as such, as the
|
||||
* buffer may remain in the store position as a result of the user process
|
||||
* not yet having acknowledged the buffer in the hold position yet.
|
||||
*/
|
||||
#define ZBUF_FLAG_IMMUTABLE 0x00000001 /* Set when owned by user. */
|
||||
|
||||
/*
|
||||
* Release a page we've previously wired.
|
||||
*/
|
||||
@ -254,6 +262,9 @@ bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
|
||||
src_bytes = (u_char *)src;
|
||||
zb = (struct zbuf *)buf;
|
||||
|
||||
KASSERT((zb->zb_flags & ZBUF_FLAG_IMMUTABLE) == 0,
|
||||
("bpf_zerocopy_append_bytes: ZBUF_FLAG_IMMUTABLE"));
|
||||
|
||||
/*
|
||||
* Scatter-gather copy to user pages mapped into kernel address space
|
||||
* using sf_bufs: copy up to a page at a time.
|
||||
@ -303,6 +314,9 @@ bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
|
||||
m = (struct mbuf *)src;
|
||||
zb = (struct zbuf *)buf;
|
||||
|
||||
KASSERT((zb->zb_flags & ZBUF_FLAG_IMMUTABLE) == 0,
|
||||
("bpf_zerocopy_append_mbuf: ZBUF_FLAG_IMMUTABLE"));
|
||||
|
||||
/*
|
||||
* Scatter gather both from an mbuf chain and to a user page set
|
||||
* mapped into kernel address space using sf_bufs. If we're lucky,
|
||||
@ -343,10 +357,39 @@ bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Notification from the BPF framework that a buffer in the store position is
|
||||
* rejecting packets and may be considered full. We mark the buffer as
|
||||
* immutable and assign to userspace so that it is immediately available for
|
||||
* the user process to access.
|
||||
*/
|
||||
void
|
||||
bpf_zerocopy_buffull(struct bpf_d *d)
|
||||
{
|
||||
struct zbuf *zb;
|
||||
|
||||
KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
|
||||
("bpf_zerocopy_buffull: not in zbuf mode"));
|
||||
|
||||
zb = (struct zbuf *)d->bd_sbuf;
|
||||
KASSERT(zb != NULL, ("bpf_zerocopy_buffull: zb == NULL"));
|
||||
|
||||
if ((zb->zb_flags & ZBUF_FLAG_IMMUTABLE) == 0) {
|
||||
zb->zb_flags |= ZBUF_FLAG_IMMUTABLE;
|
||||
zb->zb_header->bzh_kernel_len = d->bd_slen;
|
||||
atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Notification from the BPF framework that a buffer has moved into the held
|
||||
* slot on a descriptor. Zero-copy BPF will update the shared page to let
|
||||
* the user process know.
|
||||
* the user process know and flag the buffer as immutable if it hasn't
|
||||
* already been marked immutable due to filling while it was in the store
|
||||
* position.
|
||||
*
|
||||
* Note: identical logic as in bpf_zerocopy_buffull(), except that we operate
|
||||
* on bd_hbuf and bd_hlen.
|
||||
*/
|
||||
void
|
||||
bpf_zerocopy_bufheld(struct bpf_d *d)
|
||||
@ -358,8 +401,12 @@ bpf_zerocopy_bufheld(struct bpf_d *d)
|
||||
|
||||
zb = (struct zbuf *)d->bd_hbuf;
|
||||
KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
|
||||
|
||||
if ((zb->zb_flags & ZBUF_FLAG_IMMUTABLE) == 0) {
|
||||
zb->zb_flags |= ZBUF_FLAG_IMMUTABLE;
|
||||
zb->zb_header->bzh_kernel_len = d->bd_hlen;
|
||||
atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -385,6 +432,28 @@ bpf_zerocopy_canfreebuf(struct bpf_d *d)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Query from the BPF framework as to whether or not the buffer current in
|
||||
* the store position can actually be written to. This may return false if
|
||||
* the store buffer is assigned to userspace before the hold buffer is
|
||||
* acknowledged.
|
||||
*/
|
||||
int
|
||||
bpf_zerocopy_canwritebuf(struct bpf_d *d)
|
||||
{
|
||||
struct zbuf *zb;
|
||||
|
||||
KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
|
||||
("bpf_zerocopy_canwritebuf: not in zbuf mode"));
|
||||
|
||||
zb = (struct zbuf *)d->bd_sbuf;
|
||||
KASSERT(zb != NULL, ("bpf_zerocopy_canwritebuf: bd_sbuf NULL"));
|
||||
|
||||
if (zb->zb_flags & ZBUF_FLAG_IMMUTABLE)
|
||||
return (0);
|
||||
return (1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free zero copy buffers at request of descriptor.
|
||||
*/
|
||||
|
@ -40,8 +40,10 @@ void bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
|
||||
void *src, u_int len);
|
||||
void bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
|
||||
void *src, u_int len);
|
||||
void bpf_zerocopy_buffull(struct bpf_d *);
|
||||
void bpf_zerocopy_bufheld(struct bpf_d *);
|
||||
int bpf_zerocopy_canfreebuf(struct bpf_d *);
|
||||
int bpf_zerocopy_canwritebuf(struct bpf_d *);
|
||||
void bpf_zerocopy_free(struct bpf_d *d);
|
||||
int bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d,
|
||||
size_t *i);
|
||||
|
Loading…
Reference in New Issue
Block a user