- Improve performance for writer-only BPF users.
Linux and Solaris (at least OpenSolaris) have PF_PACKET socket families to send raw ethernet frames. The only FreeBSD interface that can be used to send raw frames is BPF. As a result, many programs like cdpd, lldpd, and various dhcp stuff use BPF only to send data. This leads us to the situation where software like cdpd, being run on a high-traffic-volume interface, significantly reduces overall performance since we have to acquire additional locks for every packet. Here we add a sysctl that changes BPF behavior in the following way: if a program opens a BPF socket without explicitly specifying a read filter, we assume it to be write-only and add it to a special writer-only per-interface list. This makes bpf_peers_present() return 0, so no additional overhead is introduced. After a filter is supplied, the descriptor is added to the original per-interface list, permitting packets to be captured. Unfortunately, pcap_open_live() sets a catch-all filter itself for the purpose of setting snap length. Fortunately, most programs explicitly set an (even catch-all) filter after that. tcpdump(1) is a good example. So a bit hackish approach is taken: we upgrade the descriptor only after the second BIOCSETF is received. The sysctl is named net.bpf.optimize_writers and is turned off by default. - While here, document all sysctl variables in bpf.4 Sponsored by Yandex LLC Reviewed by: glebius (previous version) Reviewed by: silence on -net@ Approved by: (mentor) MFC after: 4 weeks
This commit is contained in:
parent
8b1d10268c
commit
85ccef88d3
@ -952,10 +952,33 @@ array initializers:
|
||||
.Fn BPF_STMT opcode operand
|
||||
and
|
||||
.Fn BPF_JUMP opcode operand true_offset false_offset .
|
||||
.Sh FILES
|
||||
.Bl -tag -compact -width /dev/bpf
|
||||
.It Pa /dev/bpf
|
||||
the packet filter device
|
||||
.Sh SYSCTL VARIABLES
|
||||
A set of
|
||||
.Xr sysctl 8
|
||||
variables controls the behaviour of the
|
||||
.Nm
|
||||
subsystem:
|
||||
.Bl -tag -width indent
|
||||
.It Va net.bpf.optimize_writers: No 0
|
||||
Various programs use BPF to send (but not receive) raw packets
|
||||
(cdpd, lldpd, dhcpd, dhcp relays, etc. are good examples of such programs).
|
||||
They do not need incoming packets to be sent to them. Turning this option on
|
||||
makes new BPF users be attached to the write-only interface list until the program
|
||||
explicitly specifies a read filter via
|
||||
.Fn pcap_setfilter .
|
||||
This removes any performance degradation for high-speed interfaces.
|
||||
.It Va net.bpf.stats:
|
||||
Binary interface for retrieving general statistics.
|
||||
.It Va net.bpf.zerocopy_enable: No 0
|
||||
Permits zero-copy to be used with net BPF readers. Use with caution.
|
||||
.It Va net.bpf.maxinsns: No 512
|
||||
Maximum number of instructions that a BPF program can contain. Use
|
||||
.Xr tcpdump 1
|
||||
-d option to determine the approximate number of instructions for any filter.
|
||||
.It Va net.bpf.maxbufsize: No 524288
|
||||
Maximum buffer size to allocate for the packet buffer.
|
||||
.It Va net.bpf.bufsize: No 4096
|
||||
Default buffer size to allocate for the packet buffer.
|
||||
.El
|
||||
.Sh EXAMPLES
|
||||
The following filter is taken from the Reverse ARP Daemon.
|
||||
|
@ -176,6 +176,12 @@ SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
|
||||
static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
|
||||
bpf_stats_sysctl, "bpf statistics portal");
|
||||
|
||||
static VNET_DEFINE(int, bpf_optimize_writers) = 0;
|
||||
#define V_bpf_optimize_writers VNET(bpf_optimize_writers)
|
||||
SYSCTL_VNET_INT(_net_bpf, OID_AUTO, optimize_writers,
|
||||
CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0,
|
||||
"Do not send packets until BPF program is set");
|
||||
|
||||
static d_open_t bpfopen;
|
||||
static d_read_t bpfread;
|
||||
static d_write_t bpfwrite;
|
||||
@ -572,17 +578,66 @@ static void
|
||||
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
|
||||
{
|
||||
/*
|
||||
* Point d at bp, and add d to the interface's list of listeners.
|
||||
* Finally, point the driver's bpf cookie at the interface so
|
||||
* it will divert packets to bpf.
|
||||
* Point d at bp, and add d to the interface's list.
|
||||
* Since there are many applications using BPF for
|
||||
* sending raw packets only (dhcpd, cdpd are good examples)
|
||||
* we can delay adding d to the list of active listeners until
|
||||
* some filter is configured.
|
||||
*/
|
||||
BPFIF_WLOCK(bp);
|
||||
d->bd_bif = bp;
|
||||
LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
|
||||
|
||||
bpf_bpfd_cnt++;
|
||||
BPFIF_WLOCK(bp);
|
||||
|
||||
if (V_bpf_optimize_writers != 0) {
|
||||
/* Add to writers-only list */
|
||||
LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
|
||||
/*
|
||||
* We decrement bd_writer on every filter set operation.
|
||||
* First BIOCSETF is done by pcap_open_live() to set up
|
||||
* snap length. After that the application usually sets its own filter.
|
||||
*/
|
||||
d->bd_writer = 2;
|
||||
} else
|
||||
LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
|
||||
|
||||
BPFIF_WUNLOCK(bp);
|
||||
|
||||
BPF_LOCK();
|
||||
bpf_bpfd_cnt++;
|
||||
BPF_UNLOCK();
|
||||
|
||||
CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
|
||||
__func__, d->bd_pid, d->bd_writer ? "writer" : "active");
|
||||
|
||||
if (V_bpf_optimize_writers == 0)
|
||||
EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add d to the list of active bp filters.
|
||||
* Requires bpf_attachd() to be called beforehand.
|
||||
*/
|
||||
static void
|
||||
bpf_upgraded(struct bpf_d *d)
|
||||
{
|
||||
struct bpf_if *bp;
|
||||
|
||||
bp = d->bd_bif;
|
||||
|
||||
BPFIF_WLOCK(bp);
|
||||
BPFD_WLOCK(d);
|
||||
|
||||
/* Remove from writers-only list */
|
||||
LIST_REMOVE(d, bd_next);
|
||||
LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
|
||||
/* Mark d as reader */
|
||||
d->bd_writer = 0;
|
||||
|
||||
BPFD_WUNLOCK(d);
|
||||
BPFIF_WUNLOCK(bp);
|
||||
|
||||
CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
|
||||
|
||||
EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
|
||||
}
|
||||
|
||||
@ -596,12 +651,17 @@ bpf_detachd(struct bpf_d *d)
|
||||
struct bpf_if *bp;
|
||||
struct ifnet *ifp;
|
||||
|
||||
CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
|
||||
|
||||
BPF_LOCK_ASSERT();
|
||||
|
||||
bp = d->bd_bif;
|
||||
BPFIF_WLOCK(bp);
|
||||
BPFD_WLOCK(d);
|
||||
|
||||
/* Save bd_writer value */
|
||||
error = d->bd_writer;
|
||||
|
||||
/*
|
||||
* Remove d from the interface's descriptor list.
|
||||
*/
|
||||
@ -615,7 +675,9 @@ bpf_detachd(struct bpf_d *d)
|
||||
/* We're already protected by global lock. */
|
||||
bpf_bpfd_cnt--;
|
||||
|
||||
EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
|
||||
/* Call event handler iff d is attached */
|
||||
if (error == 0)
|
||||
EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
|
||||
|
||||
/*
|
||||
* Check if this descriptor had requested promiscuous mode.
|
||||
@ -1536,6 +1598,7 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
|
||||
#ifdef COMPAT_FREEBSD32
|
||||
struct bpf_program32 *fp32;
|
||||
struct bpf_program fp_swab;
|
||||
int need_upgrade = 0;
|
||||
|
||||
if (cmd == BIOCSETWF32 || cmd == BIOCSETF32 || cmd == BIOCSETFNR32) {
|
||||
fp32 = (struct bpf_program32 *)fp;
|
||||
@ -1611,6 +1674,16 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
|
||||
#endif
|
||||
if (cmd == BIOCSETF)
|
||||
reset_d(d);
|
||||
|
||||
/*
|
||||
* Do not require upgrade by first BIOCSETF
|
||||
* (used to set snaplen) by pcap_open_live()
|
||||
*/
|
||||
if ((d->bd_writer != 0) && (--d->bd_writer == 0))
|
||||
need_upgrade = 1;
|
||||
CTR4(KTR_NET, "%s: filter function set by pid %d, "
|
||||
"bd_writer counter %d, need_upgrade %d",
|
||||
__func__, d->bd_pid, d->bd_writer, need_upgrade);
|
||||
}
|
||||
BPFD_WUNLOCK(d);
|
||||
BPFIF_WUNLOCK(d->bd_bif);
|
||||
@ -1621,6 +1694,10 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
|
||||
bpf_destroy_jit_filter(ofunc);
|
||||
#endif
|
||||
|
||||
/* Move d to active readers list */
|
||||
if (need_upgrade != 0)
|
||||
bpf_upgraded(d);
|
||||
|
||||
return (0);
|
||||
}
|
||||
free((caddr_t)fcode, M_BPF);
|
||||
@ -2265,6 +2342,7 @@ bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
|
||||
panic("bpfattach");
|
||||
|
||||
LIST_INIT(&bp->bif_dlist);
|
||||
LIST_INIT(&bp->bif_wlist);
|
||||
bp->bif_ifp = ifp;
|
||||
bp->bif_dlt = dlt;
|
||||
rw_init(&bp->bif_lock, "bpf interface lock");
|
||||
@ -2520,6 +2598,13 @@ bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
|
||||
index = 0;
|
||||
LIST_FOREACH(bp, &bpf_iflist, bif_next) {
|
||||
BPFIF_RLOCK(bp);
|
||||
/* Send writers-only first */
|
||||
LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
|
||||
xbd = &xbdbuf[index++];
|
||||
BPFD_RLOCK(bd);
|
||||
bpfstats_fill_xbpf(xbd, bd);
|
||||
BPFD_RUNLOCK(bd);
|
||||
}
|
||||
LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
|
||||
xbd = &xbdbuf[index++];
|
||||
BPFD_RLOCK(bd);
|
||||
|
@ -1104,6 +1104,7 @@ struct bpf_if {
|
||||
u_int bif_hdrlen; /* length of link header */
|
||||
struct ifnet *bif_ifp; /* corresponding interface */
|
||||
struct rwlock bif_lock; /* interface lock */
|
||||
LIST_HEAD(, bpf_d) bif_wlist; /* writer-only list */
|
||||
#endif
|
||||
};
|
||||
|
||||
|
@ -79,6 +79,7 @@ struct bpf_d {
|
||||
u_char bd_promisc; /* true if listening promiscuously */
|
||||
u_char bd_state; /* idle, waiting, or timed out */
|
||||
u_char bd_immediate; /* true to return on packet arrival */
|
||||
u_char bd_writer; /* non-zero if d is writer-only */
|
||||
int bd_hdrcmplt; /* false to fill in src lladdr automatically */
|
||||
int bd_direction; /* select packet direction */
|
||||
int bd_tstamp; /* select time stamping function */
|
||||
|
Loading…
x
Reference in New Issue
Block a user