Maintain and observe a ZBUF_FLAG_IMMUTABLE flag on zero-copy BPF

buffer kernel descriptors, which is used to allow the buffer currently in the BPF "store" position to be assigned to userspace when it fills, even if userspace hasn't acknowledged the buffer in the "hold" position yet. To implement this, notify the buffer model when a buffer becomes full, and check that the store buffer is writable, rather than only that it has room, before trying to append new packet data. Shared memory buffers will be assigned to userspace at most once per fill, be it in the store or in the hold position. This removes the restriction that at most one shared memory buffer can be owned by userspace, reducing the chances that userspace will need to call select() after acknowledging one buffer in order to wait for the next buffer when under high load. This more fully realizes the goal of zero system calls in order to process a high-speed packet stream from BPF. Update bpf.4 to reflect that both buffers may be owned by userspace at once; caution against assuming this.
svn path=/head/; revision=177966
2008-04-07 02:51:00 +00:00 · 2008-04-07 02:51:00 +00:00 · a7a91e6592 · 2020-12-20 02:59:44 +00:00
commit a7a91e6592
parent 08304c1617
4 changed files with 133 additions and 22 deletions
--- a/share/man/man4/bpf.4
+++ b/share/man/man4/bpf.4
@ -259,14 +259,14 @@ may be used to sleep awaiting the availbility of a completed buffer.
 They will return a readable file descriptor when ownership of the next buffer
 is assigned to user space.
 .Pp
-In the current implementation, the kernel will assign ownership of at most
-one buffer at a time to the user process.
-The user processes must acknowledge the current buffer in order to be
-notified that the next buffer is ready for processing.
-Programs should not rely on this as an invariant, as it may change in future
-versions; in particular, they must maintain their own notion of which buffer
-is "next" so that if both buffers are owned by userspace, it can process them
-in the correct order.
+In the current implementation, the kernel may assign zero, one, or both
+buffers to the user process; however, an earlier implementation maintained
+the invariant that at most one buffer could be assigned to the user process
+at a time.
+In order to both ensure progress and high performance, user processes should
+acknowledge a completely processed buffer as quickly as possible, returning
+it for reuse, and not block waiting on a second buffer while holding another
+buffer.
 .Sh IOCTLS
 The
 .Xr ioctl 2
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@ -218,6 +218,45 @@ bpf_canfreebuf(struct bpf_d *d)
 	return (0);
 }

+/*
+ * Allow the buffer model to indicate that the current store buffer is
+ * immutable, regardless of the appearance of space.  Return (1) if the
+ * buffer is writable, and (0) if not.
+ */
+static int
+bpf_canwritebuf(struct bpf_d *d)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_canwritebuf(d));
+	}
+	return (1);
+}
+
+/*
+ * Notify buffer model that an attempt to write to the store buffer has
+ * resulted in a dropped packet, in which case the buffer may be considered
+ * full.
+ */
+static void
+bpf_buffull(struct bpf_d *d)
+{
+
+	BPFD_LOCK_ASSERT(d);
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_ZBUF:
+		bpf_zerocopy_buffull(d);
+		break;
+	}
+}
+
+/*
+ * Notify the buffer model that a buffer has moved into the hold position.
+ */
 void
 bpf_bufheld(struct bpf_d *d)
 {
@ -1691,27 +1730,28 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,

 	/*
 	 * Round up the end of the previous packet to the next longword.
+	 *
+	 * Drop the packet if there's no room and no hope of room.
+	 * If the packet would overflow the storage buffer or the storage
+	 * buffer is considered immutable by the buffer model, try to rotate
+	 * the buffer and wakeup pending processes.
 	 */
 	curlen = BPF_WORDALIGN(d->bd_slen);
-	if (curlen + totlen > d->bd_bufsize) {
-		/*
-		 * This packet will overflow the storage buffer.
-		 * Rotate the buffers if we can, then wakeup any
-		 * pending reads.
-		 */
+	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
 		if (d->bd_fbuf == NULL) {
 			/*
-			 * We haven't completed the previous read yet,
-			 * so drop the packet.
+			 * There's no room in the store buffer, and no
+			 * prospect of room, so drop the packet.  Notify the
+			 * buffer model.
 			 */
+			bpf_buffull(d);
 			++d->bd_dcount;
 			return;
 		}
 		ROTATE_BUFFERS(d);
 		do_wakeup = 1;
 		curlen = 0;
-	}
-	else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
+	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		/*
 		 * Immediate mode is set, or the read timeout has already
 		 * expired during a select call.  A packet arrived, so the
--- a/sys/net/bpf_zerocopy.c
+++ b/sys/net/bpf_zerocopy.c
@ -85,7 +85,7 @@ __FBSDID("$FreeBSD$");
 * scatter-gather copying.  One significant mitigating factor is that on
 * systems with a direct memory map, we can avoid TLB misses.
 *
- * At the front of the shared memor region is a bpf_zbuf_header, which
+ * At the front of the shared memory region is a bpf_zbuf_header, which
 * contains shared control data to allow user space and the kernel to
 * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF
 * knows that the space is not available.
@ -94,10 +94,18 @@ struct zbuf {
 	vm_offset_t	 zb_uaddr;	/* User address, may be stale. */
 	size_t		 zb_size;	/* Size of buffer, incl. header. */
 	u_int		 zb_numpages;	/* Number of pages. */
+	int		 zb_flags;	/* Flags on zbuf. */
 	struct sf_buf	**zb_pages;	/* Pages themselves. */
 	struct bpf_zbuf_header	*zb_header;	/* Shared header. */
 };

+/*
+ * When a buffer has been assigned to userspace, flag it as such, as the
+ * buffer may remain in the store position as a result of the user process
+ * not yet having acknowledged the buffer in the hold position.
+ */
+#define	ZBUF_FLAG_IMMUTABLE	0x00000001	/* Set when owned by user. */
+
 /*
 * Release a page we've previously wired.
 */
@ -254,6 +262,9 @@ bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
 	src_bytes = (u_char *)src;
 	zb = (struct zbuf *)buf;

+	KASSERT((zb->zb_flags & ZBUF_FLAG_IMMUTABLE) == 0,
+	    ("bpf_zerocopy_append_bytes: ZBUF_FLAG_IMMUTABLE"));
+
 	/*
 	 * Scatter-gather copy to user pages mapped into kernel address space
 	 * using sf_bufs: copy up to a page at a time.
@ -303,6 +314,9 @@ bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
 	m = (struct mbuf *)src;
 	zb = (struct zbuf *)buf;

+	KASSERT((zb->zb_flags & ZBUF_FLAG_IMMUTABLE) == 0,
+	    ("bpf_zerocopy_append_mbuf: ZBUF_FLAG_IMMUTABLE"));
+
 	/*
 	 * Scatter gather both from an mbuf chain and to a user page set
 	 * mapped into kernel address space using sf_bufs.  If we're lucky,
@ -343,10 +357,39 @@ bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
 	}
 }

+/*
+ * Notification from the BPF framework that a buffer in the store position is
+ * rejecting packets and may be considered full.  We mark the buffer as
+ * immutable and assign to userspace so that it is immediately available for
+ * the user process to access.
+ */
+void
+bpf_zerocopy_buffull(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_buffull: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_sbuf;
+	KASSERT(zb != NULL, ("bpf_zerocopy_buffull: zb == NULL"));
+
+	if ((zb->zb_flags & ZBUF_FLAG_IMMUTABLE) == 0) {
+		zb->zb_flags |= ZBUF_FLAG_IMMUTABLE;
+		zb->zb_header->bzh_kernel_len = d->bd_slen;
+		atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
+	}
+}
+
 /*
 * Notification from the BPF framework that a buffer has moved into the held
 * slot on a descriptor.  Zero-copy BPF will update the shared page to let
- * the user process know.
+ * the user process know and flag the buffer as immutable if it hasn't
+ * already been marked immutable due to filling while it was in the store
+ * position.
+ *
+ * Note: same logic as in bpf_zerocopy_buffull(), except that we operate
+ * on bd_hbuf and bd_hlen.
 */
 void
 bpf_zerocopy_bufheld(struct bpf_d *d)
@ -358,8 +401,12 @@ bpf_zerocopy_bufheld(struct bpf_d *d)

 	zb = (struct zbuf *)d->bd_hbuf;
 	KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
-	zb->zb_header->bzh_kernel_len = d->bd_hlen;
-	atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
+
+	if ((zb->zb_flags & ZBUF_FLAG_IMMUTABLE) == 0) {
+		zb->zb_flags |= ZBUF_FLAG_IMMUTABLE;
+		zb->zb_header->bzh_kernel_len = d->bd_hlen;
+		atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
+	}
 }

 /*
@ -385,6 +432,28 @@ bpf_zerocopy_canfreebuf(struct bpf_d *d)
 	return (0);
 }

+/*
+ * Query from the BPF framework as to whether or not the buffer currently in
+ * the store position can actually be written to.  This may return false if
+ * the store buffer is assigned to userspace before the hold buffer is
+ * acknowledged.
+ */
+int
+bpf_zerocopy_canwritebuf(struct bpf_d *d)
+{
+	struct zbuf *zb;
+
+	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+	    ("bpf_zerocopy_canwritebuf: not in zbuf mode"));
+
+	zb = (struct zbuf *)d->bd_sbuf;
+	KASSERT(zb != NULL, ("bpf_zerocopy_canwritebuf: bd_sbuf NULL"));
+
+	if (zb->zb_flags & ZBUF_FLAG_IMMUTABLE)
+		return (0);
+	return (1);
+}
+
 /*
 * Free zero copy buffers at request of descriptor.
 */
--- a/sys/net/bpf_zerocopy.h
+++ b/sys/net/bpf_zerocopy.h
@ -40,8 +40,10 @@ void	bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
 	    void *src, u_int len);
 void	bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
 	    void *src, u_int len);
+void	bpf_zerocopy_buffull(struct bpf_d *);
 void	bpf_zerocopy_bufheld(struct bpf_d *);
 int	bpf_zerocopy_canfreebuf(struct bpf_d *);
+int	bpf_zerocopy_canwritebuf(struct bpf_d *);
 void	bpf_zerocopy_free(struct bpf_d *d);
 int	bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d,
 	    size_t *i);