/*-
 * Copyright (c) 2015
 *	Jonathan Looney. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_pcap.h>

#define	M_LEADINGSPACE_NOWRITE(m)	\
	((m)->m_data - M_START(m))

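/*
 * Explanatory note: M_LEADINGSPACE_NOWRITE(m) is the number of bytes
 * between the start of m's storage (M_START(m)) and its current data
 * pointer. Unlike M_LEADINGSPACE(), it does not care whether the
 * storage is writable; it is only used below to reproduce the source
 * mbuf's data offset when copying into a capture mbuf.
 */
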
int tcp_pcap_aggressive_free = 1;
static int tcp_pcap_clusters_referenced_cur = 0;
static int tcp_pcap_clusters_referenced_max = 0;

SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
    CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
    "Free saved packets when the memory system comes under pressure");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
    CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
    "Number of clusters currently referenced on TCP PCAP queues");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
    CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
    "Maximum number of clusters allowed to be referenced on TCP PCAP "
    "queues");

static int tcp_pcap_alloc_reuse_ext = 0;
static int tcp_pcap_alloc_reuse_mbuf = 0;
static int tcp_pcap_alloc_new_mbuf = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
    CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
    "Number of mbufs with external storage reused for the TCP PCAP "
    "functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
    CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
    "Number of mbufs with internal storage reused for the TCP PCAP "
    "functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
    CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
    "Number of new mbufs allocated for the TCP PCAP functionality");

VNET_DEFINE(int, tcp_pcap_packets) = 0;
#define	V_tcp_pcap_packets	VNET(tcp_pcap_packets)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
    "Default number of packets saved per direction per TCPCB");

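/*
 * Illustrative usage (an assumption about how the knobs are driven, not
 * something this file enforces): the declaration above exposes the
 * default per-direction queue depth as net.inet.tcp.tcp_pcap_packets,
 * which tcp_pcap_tcpcb_init() below uses when sizing a new connection's
 * capture queues. For example:
 *
 *	sysctl net.inet.tcp.tcp_pcap_packets=40
 *
 * tcp_pcap_aggressive_free and tcp_pcap_clusters_referenced_max are
 * tunable (CTLFLAG_RW); the remaining counters are read-only statistics
 * (CTLFLAG_RD).
 */
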
/* Initialize the values. */
static void
tcp_pcap_max_set(void)
{

	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
}

void
tcp_pcap_init(void)
{

	tcp_pcap_max_set();
	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
	    NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * If we are below the maximum allowed cluster references,
 * increment the reference count and return TRUE. Otherwise,
 * leave the reference count alone and return FALSE.
 */
static __inline bool
tcp_pcap_take_cluster_reference(void)
{
	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
	    tcp_pcap_clusters_referenced_max) {
		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
		return FALSE;
	}
	return TRUE;
}

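/*
 * Explanatory note: atomic_fetchadd_int() returns the counter's value
 * from before the add, so the test above compares the pre-increment
 * count against the limit. When the limit has already been reached, the
 * optimistic increment is undone with atomic_add_int(..., -1) before
 * returning FALSE.
 */
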
/*
 * For all the external entries in m, apply the given adjustment.
 * This can be used to adjust the counter when an mbuf chain is
 * copied or freed.
 */
static __inline void
tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
{
	while (m) {
		if (m->m_flags & M_EXT)
			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);

		m = m->m_next;
	}
}

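/*
 * Explanatory note: tcp_pcap_add() below calls this with adj = 1 right
 * after m_copym(), because the copied chain shares (rather than
 * duplicates) any clusters of the source chain; each shared cluster has
 * to be counted in tcp_pcap_clusters_referenced_cur so that
 * tcp_pcap_m_freem() balances the books when the copy is freed.
 */
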
/*
 * Free all mbufs in a chain, decrementing the reference count as
 * necessary.
 *
 * Functions in this file should use this instead of m_freem() when
 * they are freeing mbuf chains that may contain clusters that were
 * already included in tcp_pcap_clusters_referenced_cur.
 */
static void
tcp_pcap_m_freem(struct mbuf *mb)
{
	while (mb != NULL) {
		if (mb->m_flags & M_EXT)
			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
			    1);
		mb = m_free(mb);
	}
}

/*
 * Copy data from m to n, where n cannot fit all the data we might
 * want from m.
 *
 * Prioritize data like this:
 * 1. TCP header
 * 2. IP header
 * 3. Data
 */
static void
tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
{
	struct mbuf *m_cur = m;
	int bytes_to_copy = 0, trailing_data, skip = 0, tcp_off;

	/* Below, we assume these will be non-NULL. */
	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(n, ("%s: called with n == NULL", __func__));

	/* We assume this initialization occurred elsewhere. */
	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
	    __func__, n->m_len));
	KASSERT(n->m_data == M_START(n),
	    ("%s: called with n->m_data != M_START(n)", __func__));

	/*
	 * Calculate the size of the TCP header. We use this often
	 * enough that it is worth just calculating at the start.
	 */
	tcp_off = th->th_off << 2;

	/* Trim off leading empty mbufs. */
	while (m && m->m_len == 0)
		m = m->m_next;

	if (m) {
		m_cur = m;
	}
	else {
		/*
		 * No data? Highly unusual. We would expect to at
		 * least see a TCP header in the mbuf.
		 * As we have a pointer to the TCP header, I guess
		 * we should just copy that. (???)
		 */
fallback:
		bytes_to_copy = tcp_off;
		if (bytes_to_copy > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		bcopy(th, n->m_data, bytes_to_copy);
		n->m_len = bytes_to_copy;
		return;
	}

	/*
	 * Find TCP header. Record the total number of bytes up to,
	 * and including, the TCP header.
	 */
	while (m_cur) {
		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
		    (caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
			break;
		bytes_to_copy += m_cur->m_len;
		m_cur = m_cur->m_next;
	}
	if (m_cur)
		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
	else
		goto fallback;
	bytes_to_copy += tcp_off;

	/*
	 * If we already want to copy more bytes than we can hold
	 * in the destination mbuf, skip leading bytes and copy
	 * what we can.
	 *
	 * Otherwise, consider trailing data.
	 */
	if (bytes_to_copy > M_SIZE(n)) {
		skip = bytes_to_copy - M_SIZE(n);
		bytes_to_copy = M_SIZE(n);
	}
	else {
		/*
		 * Determine how much trailing data is in the chain.
		 * We start with the length of this mbuf (the one
		 * containing th) and subtract the size of the TCP
		 * header (tcp_off) and the size of the data prior
		 * to th (th - m_cur->m_data).
		 *
		 * This *should not* be negative, as the TCP code
		 * should put the whole TCP header in a single
		 * mbuf. But, it isn't a problem if it is. We will
		 * simply work off our negative balance as we look
		 * at subsequent mbufs.
		 */
		trailing_data = m_cur->m_len - tcp_off;
		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
		m_cur = m_cur->m_next;
		while (m_cur) {
			trailing_data += m_cur->m_len;
			m_cur = m_cur->m_next;
		}
		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		else
			bytes_to_copy += trailing_data;
	}

	m_copydata(m, skip, bytes_to_copy, n->m_data);
	n->m_len = bytes_to_copy;
}

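/*
 * Worked example for the best-fit logic above (hypothetical numbers,
 * not taken from any particular packet): suppose the first mbuf in the
 * chain holds a 20-byte IPv4 header followed by the TCP header, so th
 * points 20 bytes past m->m_data; th->th_off is 8 (a 32-byte TCP
 * header); 1400 bytes of payload follow in later mbufs; and M_SIZE(n)
 * is 224.
 *
 *	bytes_to_copy = 20 + 32 = 52	(data before th plus the TCP header)
 *	trailing_data = 1400
 *	52 + 1400 > 224, so bytes_to_copy is capped at 224 and skip
 *	stays 0: the copy keeps both headers plus the first 172 bytes
 *	of payload.
 *
 * If, instead, everything up to and including the TCP header exceeded
 * M_SIZE(n), skip would drop leading bytes so the copy ends exactly at
 * the end of the TCP header, sacrificing the IP header before any TCP
 * header bytes, which matches the priority order documented above.
 */
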
void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse the mbuf. Once
	 * we already have a new mbuf (n), then we can
	 * simply free subsequent mbufs.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		}
		else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * free-ing process.
			 */
			if (mhead->m_flags & M_EXTPG) {
				/* Don't mess around with these. */
				tcp_pcap_m_freem(mhead);
				continue;
			} else if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
					    mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
						 */
						*(mhead->m_ext.ext_cnt) = 1;
						tcp_pcap_m_freem(mhead);
						continue;
					}
					/*
					 * We were able to cleanly free the
					 * reference.
					 */
					atomic_subtract_int(
					    &tcp_pcap_clusters_referenced_cur,
					    1);
					tcp_pcap_alloc_reuse_ext++;
					break;
				}
			} else {
				tcp_pcap_alloc_reuse_mbuf++;
			}

			n = mhead;
			tcp_pcap_m_freem(n->m_next);
			m_init(n, M_NOWAIT, MT_DATA, 0);
		}
	}

	/* Check to see if we need to get a new mbuf. */
	if (!n) {
		if (!(n = m_get(M_NOWAIT, MT_DATA)))
			return;
		tcp_pcap_alloc_new_mbuf++;
	}

	/*
	 * What are we dealing with? If a cluster, attach it. Otherwise,
	 * try to copy the data from the beginning of the mbuf to the
	 * end of data. (There may be data between the start of the data
	 * area and the current data pointer. We want to get this, because
	 * it may contain header information that is useful.)
	 * In cases where that isn't possible, settle for what we can
	 * get.
	 */
	if ((m->m_flags & (M_EXT|M_EXTPG)) &&
	    tcp_pcap_take_cluster_reference()) {
		n->m_data = m->m_data;
		n->m_len = m->m_len;
		mb_dupcl(n, m);
	}
	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
		/*
		 * At this point, n is guaranteed to be a normal mbuf
		 * with no cluster and no packet header. Because the
		 * logic in this code block requires this, the assert
		 * is here to catch any instances where someone
		 * changes the logic to invalidate that assumption.
		 */
		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
		    ("%s: Unexpected flags (%#x) for mbuf",
		    __func__, n->m_flags));
		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
		n->m_len = m->m_len;
		if (m->m_flags & M_EXTPG)
			m_copydata(m, 0, m->m_len, n->m_data);
		else
			bcopy(M_START(m), n->m_dat,
			    m->m_len + M_LEADINGSPACE_NOWRITE(m));
	}
	else {
		/*
		 * This is the case where we need to "settle for what
		 * we can get". The most probable way to reach this code
		 * path is that we've already taken references to the
		 * maximum number of mbuf clusters we can, and the data
		 * is too long to fit in an mbuf's internal storage.
		 * Try for a "best fit".
		 */
		tcp_pcap_copy_bestfit(th, m, n);

		/* Don't try to get additional data. */
		goto add_to_queue;
	}

	if (m->m_next) {
		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
		tcp_pcap_adj_cluster_reference(n->m_next, 1);
	}

add_to_queue:
	/* Add the new mbuf to the list. */
	if (mbufq_enqueue(queue, n)) {
		/* This shouldn't happen. If INVARIANTS is defined, panic. */
		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
		tcp_pcap_m_freem(n);
	}
}

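/*
 * Note on callers (they live outside this file, so treat the details as
 * an assumption): the TCP input and output paths are expected to hand
 * each segment to tcp_pcap_add() with th pointing into m, using
 * tp->t_inpkts for received segments and tp->t_outpkts for transmitted
 * ones; those are the queues set up in tcp_pcap_tcpcb_init() below.
 */
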
void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *m;
	while ((m = mbufq_dequeue(queue)))
		tcp_pcap_m_freem(m);
}

void
tcp_pcap_tcpcb_init(struct tcpcb *tp)
{
	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
}

void
tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
{
	queue->mq_maxlen = newval;
	while (queue->mq_len > queue->mq_maxlen)
		tcp_pcap_m_freem(mbufq_dequeue(queue));
}

int
tcp_pcap_get_sock_max(struct mbufq *queue)
{
	return queue->mq_maxlen;
}

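/*
 * Illustrative lifecycle sketch (an assumption; the callers are outside
 * this file). A consumer is expected to size a connection's capture
 * queues with tcp_pcap_set_sock_max(), for example from a socket-option
 * handler, and to release everything that was captured when the
 * connection goes away:
 *
 *	tcp_pcap_set_sock_max(&tp->t_inpkts, 40);
 *	tcp_pcap_set_sock_max(&tp->t_outpkts, 40);
 *	...
 *	tcp_pcap_drain(&tp->t_inpkts);
 *	tcp_pcap_drain(&tp->t_outpkts);
 *
 * tcp_pcap_set_sock_max() also trims an already-full queue immediately,
 * so lowering the limit frees the oldest saved segments right away.
 */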