- Improve performance for writer-only BPF users.

Linux and Solaris (at least OpenSolaris) have PF_PACKET socket families for sending
raw Ethernet frames. The only FreeBSD interface that can be used to send raw frames
is BPF. As a result, many programs such as cdpd, lldpd, and various DHCP tools use
BPF only to send data. This leads to a situation where software like cdpd,
running on a high-traffic interface, significantly reduces overall performance,
since we have to acquire additional locks for every packet.

Here we add a sysctl that changes BPF behavior in the following way:
if a program opens a BPF socket without explicitly specifying a read filter, we
assume it to be write-only and add it to a special writer-only per-interface list.
This makes bpf_peers_present() return 0, so no additional overhead is introduced.
After a filter is supplied, the descriptor is added to the original per-interface list,
permitting packets to be captured.

Unfortunately, pcap_open_live() sets catch-all filter itself for the purpose of
setting snap length.

Fortunately, most programs explicitly set a filter (even a catch-all one) after that.
tcpdump(1) is a good example.

So a slightly hackish approach is taken: we upgrade the descriptor only after the
second BIOCSETF is received.

Sysctl is named net.bpf.optimize_writers and is turned off by default.

- While here, document all sysctl variables in bpf.4

Sponsored by Yandex LLC

Reviewed by:    glebius (previous version)
Reviewed by:    silence on -net@
Approved by:    (mentor)

MFC after:      4 weeks
This commit is contained in:
Alexander V. Chernikov 2012-04-06 06:55:21 +00:00
parent e4b3229aa5
commit 51ec1eb70d
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=233938
4 changed files with 121 additions and 11 deletions

View File

@ -952,10 +952,33 @@ array initializers:
.Fn BPF_STMT opcode operand
and
.Fn BPF_JUMP opcode operand true_offset false_offset .
.Sh FILES
.Bl -tag -compact -width /dev/bpf
.It Pa /dev/bpf
the packet filter device
.Sh SYSCTL VARIABLES
A set of
.Xr sysctl 8
variables controls the behaviour of the
.Nm
subsystem:
.Bl -tag -width indent
.It Va net.bpf.optimize_writers: No 0
Various programs use BPF to send (but not receive) raw packets
(cdpd, lldpd, dhcpd, dhcp relays, etc. are good examples of such programs).
They do not need incoming packets to be sent to them.
Turning this option on causes new BPF users to be attached to the write-only
interface list until the program explicitly specifies a read filter via
.Fn pcap_setfilter .
This removes any performance degradation for high-speed interfaces.
.It Va net.bpf.stats:
Binary interface for retrieving general statistics.
.It Va net.bpf.zerocopy_enable: No 0
Permits zero-copy to be used with net BPF readers. Use with caution.
.It Va net.bpf.maxinsns: No 512
Maximum number of instructions that a BPF program can contain.
Use the
.Xr tcpdump 1
.Fl d
option to determine the approximate number of instructions for any filter.
.It Va net.bpf.maxbufsize: No 524288
Maximum buffer size to allocate for the packet buffer.
.It Va net.bpf.bufsize: No 4096
Default buffer size to allocate for the packet buffer.
.El
.Sh EXAMPLES
The following filter is taken from the Reverse ARP Daemon.

View File

@ -176,6 +176,12 @@ SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
bpf_stats_sysctl, "bpf statistics portal");
static VNET_DEFINE(int, bpf_optimize_writers) = 0;
#define V_bpf_optimize_writers VNET(bpf_optimize_writers)
SYSCTL_VNET_INT(_net_bpf, OID_AUTO, optimize_writers,
CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0,
"Do not send packets until BPF program is set");
static d_open_t bpfopen;
static d_read_t bpfread;
static d_write_t bpfwrite;
@ -572,17 +578,66 @@ static void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
/*
* Point d at bp, and add d to the interface's list of listeners.
* Finally, point the driver's bpf cookie at the interface so
* it will divert packets to bpf.
* Point d at bp, and add d to the interface's list.
* Since there are many applications using BPF for
* sending raw packets only (dhcpd, cdpd are good examples)
* we can delay adding d to the list of active listeners until
* some filter is configured.
*/
BPFIF_WLOCK(bp);
d->bd_bif = bp;
LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
bpf_bpfd_cnt++;
BPFIF_WLOCK(bp);
if (V_bpf_optimize_writers != 0) {
/* Add to writers-only list */
LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
/*
* We decrement bd_writer on every filter set operation.
* First BIOCSETF is done by pcap_open_live() to set up
* snap length. After that the application usually sets its own filter.
*/
d->bd_writer = 2;
} else
LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
BPFIF_WUNLOCK(bp);
BPF_LOCK();
bpf_bpfd_cnt++;
BPF_UNLOCK();
CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
__func__, d->bd_pid, d->bd_writer ? "writer" : "active");
if (V_bpf_optimize_writers == 0)
EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
}
/*
 * Add d to the list of active bp filters.
 * Requires bpf_attachd() to be called before.
 *
 * Unlinks d (via its bd_next linkage) from the list it is currently on,
 * inserts it at the head of its interface's bif_dlist, clears bd_writer,
 * and finally invokes the bpf_track event handler with a last argument
 * of 1.
 */
static void
bpf_upgraded(struct bpf_d *d)
{
struct bpf_if *bp;
/* bd_bif is assigned in bpf_attachd(); it is dereferenced unconditionally. */
bp = d->bd_bif;
/* Interface lock is taken first, then the descriptor lock. */
BPFIF_WLOCK(bp);
BPFD_WLOCK(d);
/* Remove from writers-only list */
LIST_REMOVE(d, bd_next);
LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
/* Mark d as reader */
d->bd_writer = 0;
/* Locks are released in the reverse order of acquisition. */
BPFD_WUNLOCK(d);
BPFIF_WUNLOCK(bp);
CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
/*
 * Invoked after both locks have been dropped. This is the same
 * notification bpf_attachd() issues when it places d directly on
 * bif_dlist (i.e. when optimize_writers is off).
 */
EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
}
@ -596,12 +651,17 @@ bpf_detachd(struct bpf_d *d)
struct bpf_if *bp;
struct ifnet *ifp;
CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
BPF_LOCK_ASSERT();
bp = d->bd_bif;
BPFIF_WLOCK(bp);
BPFD_WLOCK(d);
/* Save bd_writer value */
error = d->bd_writer;
/*
* Remove d from the interface's descriptor list.
*/
@ -615,7 +675,9 @@ bpf_detachd(struct bpf_d *d)
/* We're already protected by global lock. */
bpf_bpfd_cnt--;
EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
/* Call event handler iff d is attached */
if (error == 0)
EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
/*
* Check if this descriptor had requested promiscuous mode.
@ -1536,6 +1598,7 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
#ifdef COMPAT_FREEBSD32
struct bpf_program32 *fp32;
struct bpf_program fp_swab;
int need_upgrade = 0;
if (cmd == BIOCSETWF32 || cmd == BIOCSETF32 || cmd == BIOCSETFNR32) {
fp32 = (struct bpf_program32 *)fp;
@ -1611,6 +1674,16 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
#endif
if (cmd == BIOCSETF)
reset_d(d);
/*
* Do not require upgrade by first BIOCSETF
* (used to set snaplen) by pcap_open_live()
*/
if ((d->bd_writer != 0) && (--d->bd_writer == 0))
need_upgrade = 1;
CTR4(KTR_NET, "%s: filter function set by pid %d, "
"bd_writer counter %d, need_upgrade %d",
__func__, d->bd_pid, d->bd_writer, need_upgrade);
}
BPFD_WUNLOCK(d);
BPFIF_WUNLOCK(d->bd_bif);
@ -1621,6 +1694,10 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
bpf_destroy_jit_filter(ofunc);
#endif
/* Move d to active readers list */
if (need_upgrade != 0)
bpf_upgraded(d);
return (0);
}
free((caddr_t)fcode, M_BPF);
@ -2265,6 +2342,7 @@ bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
panic("bpfattach");
LIST_INIT(&bp->bif_dlist);
LIST_INIT(&bp->bif_wlist);
bp->bif_ifp = ifp;
bp->bif_dlt = dlt;
rw_init(&bp->bif_lock, "bpf interface lock");
@ -2520,6 +2598,13 @@ bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
index = 0;
LIST_FOREACH(bp, &bpf_iflist, bif_next) {
BPFIF_RLOCK(bp);
/* Send writers-only first */
LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
xbd = &xbdbuf[index++];
BPFD_RLOCK(bd);
bpfstats_fill_xbpf(xbd, bd);
BPFD_RUNLOCK(bd);
}
LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
xbd = &xbdbuf[index++];
BPFD_RLOCK(bd);

View File

@ -1104,6 +1104,7 @@ struct bpf_if {
u_int bif_hdrlen; /* length of link header */
struct ifnet *bif_ifp; /* corresponding interface */
struct rwlock bif_lock; /* interface lock */
LIST_HEAD(, bpf_d) bif_wlist; /* writer-only list */
#endif
};

View File

@ -79,6 +79,7 @@ struct bpf_d {
u_char bd_promisc; /* true if listening promiscuously */
u_char bd_state; /* idle, waiting, or timed out */
u_char bd_immediate; /* true to return on packet arrival */
u_char bd_writer; /* non-zero if d is writer-only */
int bd_hdrcmplt; /* false to fill in src lladdr automatically */
int bd_direction; /* select packet direction */
int bd_tstamp; /* select time stamping function */