freebsd-skq/sys/netpfil/ipfw/ip_fw_table_value.c
ae 818d823c95 Remove "IPFW static rules" rmlock.
Make PFIL's lock global and use it for this purpose.
This reduces the number of locks needed to acquire for each packet.

Obtained from:	Yandex LLC
MFC after:	2 weeks
Sponsored by:	Yandex LLC
No objection from: #network
Differential Revision:	https://reviews.freebsd.org/D10154
2017-04-03 13:35:04 +00:00

810 lines
19 KiB
C

/*-
* Copyright (c) 2014 Yandex LLC
* Copyright (c) 2014 Alexander V. Chernikov
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Multi-field value support for ipfw tables.
*
* This file contains necessary functions to convert
* large multi-field values into u32 indices suitable to be fed
* to various table algorithms. Other machinery like proper refcounting,
* internal structures resizing are also kept here.
*/
#include "opt_ipfw.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/hash.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/queue.h>
#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
#include <netinet/ip_fw.h>
#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/ipfw/ip_fw_table.h>
static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key,
uint32_t kopt);
static int cmp_table_value(struct named_object *no, const void *key,
uint32_t kopt);
static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
struct sockopt_data *sd);
static struct ipfw_sopt_handler scodes[] = {
{ IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values },
};
#define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash)
struct table_val_link
{
struct named_object no;
struct table_value *pval; /* Pointer to real table value */
};
#define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */
struct vdump_args {
struct ip_fw_chain *ch;
struct sockopt_data *sd;
struct table_value *pval;
int error;
};
static uint32_t
hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt)
{
return (hash32_buf(key, 56, 0));
}
static int
cmp_table_value(struct named_object *no, const void *key, uint32_t kopt)
{
return (memcmp(((struct table_val_link *)no)->pval, key, 56));
}
static void
mask_table_value(struct table_value *src, struct table_value *dst,
uint32_t mask)
{
#define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; }
memset(dst, 0, sizeof(*dst));
_MCPY(tag, IPFW_VTYPE_TAG);
_MCPY(pipe, IPFW_VTYPE_PIPE);
_MCPY(divert, IPFW_VTYPE_DIVERT);
_MCPY(skipto, IPFW_VTYPE_SKIPTO);
_MCPY(netgraph, IPFW_VTYPE_NETGRAPH);
_MCPY(fib, IPFW_VTYPE_FIB);
_MCPY(nat, IPFW_VTYPE_NAT);
_MCPY(dscp, IPFW_VTYPE_DSCP);
_MCPY(nh4, IPFW_VTYPE_NH4);
_MCPY(nh6, IPFW_VTYPE_NH6);
_MCPY(zoneid, IPFW_VTYPE_NH6);
#undef _MCPY
}
static void
get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared,
struct table_value **ptv, struct namedobj_instance **pvi)
{
struct table_value *pval;
struct namedobj_instance *vi;
if (vshared != 0) {
pval = (struct table_value *)ch->valuestate;
vi = CHAIN_TO_VI(ch);
} else {
pval = NULL;
vi = NULL;
//pval = (struct table_value *)&tc->ti.data;
}
if (ptv != NULL)
*ptv = pval;
if (pvi != NULL)
*pvi = vi;
}
/*
* Update pointers to real vaues after @pval change.
*/
static int
update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
{
struct vdump_args *da;
struct table_val_link *ptv;
struct table_value *pval;
da = (struct vdump_args *)arg;
ptv = (struct table_val_link *)no;
pval = da->pval;
ptv->pval = &pval[ptv->no.kidx];
ptv->no.name = (char *)&pval[ptv->no.kidx];
return (0);
}
/*
* Grows value storage shared among all tables.
* Drops/reacquires UH locks.
* Notifies other running adds on @ch shared storage resize.
* Note function does not guarantee that free space
* will be available after invocation, so one caller needs
* to roll cycle himself.
*
* Returns 0 if case of no errors.
*/
static int
resize_shared_value_storage(struct ip_fw_chain *ch)
{
struct tables_config *tcfg;
struct namedobj_instance *vi;
struct table_value *pval, *valuestate, *old_valuestate;
void *new_idx;
struct vdump_args da;
int new_blocks;
int val_size, val_size_old;
IPFW_UH_WLOCK_ASSERT(ch);
valuestate = NULL;
new_idx = NULL;
pval = (struct table_value *)ch->valuestate;
vi = CHAIN_TO_VI(ch);
tcfg = CHAIN_TO_TCFG(ch);
val_size = tcfg->val_size * 2;
if (val_size == (1 << 30))
return (ENOSPC);
IPFW_UH_WUNLOCK(ch);
valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW,
M_WAITOK | M_ZERO);
ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx,
&new_blocks);
IPFW_UH_WLOCK(ch);
/*
* Check if we still need to resize
*/
if (tcfg->val_size >= val_size)
goto done;
/* Update pointers and notify everyone we're changing @ch */
pval = (struct table_value *)ch->valuestate;
rollback_toperation_state(ch, ch);
/* Good. Let's merge */
memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size);
ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
IPFW_WLOCK(ch);
/* Change pointers */
old_valuestate = ch->valuestate;
ch->valuestate = valuestate;
valuestate = old_valuestate;
ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
val_size_old = tcfg->val_size;
tcfg->val_size = val_size;
val_size = val_size_old;
IPFW_WUNLOCK(ch);
/* Update pointers to reflect resize */
memset(&da, 0, sizeof(da));
da.pval = (struct table_value *)ch->valuestate;
ipfw_objhash_foreach(vi, update_tvalue, &da);
done:
free(valuestate, M_IPFW);
ipfw_objhash_bitmap_free(new_idx, new_blocks);
return (0);
}
/*
* Drops reference for table value with index @kidx, stored in @pval and
* @vi. Frees value if it has no references.
*/
static void
unref_table_value(struct namedobj_instance *vi, struct table_value *pval,
uint32_t kidx)
{
struct table_val_link *ptvl;
KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx));
if (--pval[kidx].refcnt > 0)
return;
/* Last reference, delete item */
ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx);
KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx));
ipfw_objhash_del(vi, &ptvl->no);
ipfw_objhash_free_idx(vi, kidx);
free(ptvl, M_IPFW);
}
struct flush_args {
struct ip_fw_chain *ch;
struct table_algo *ta;
struct table_info *ti;
void *astate;
ipfw_obj_tentry tent;
};
static int
unref_table_value_cb(void *e, void *arg)
{
struct flush_args *fa;
struct ip_fw_chain *ch;
struct table_algo *ta;
ipfw_obj_tentry *tent;
int error;
fa = (struct flush_args *)arg;
ta = fa->ta;
memset(&fa->tent, 0, sizeof(fa->tent));
tent = &fa->tent;
error = ta->dump_tentry(fa->astate, fa->ti, e, tent);
if (error != 0)
return (error);
ch = fa->ch;
unref_table_value(CHAIN_TO_VI(ch),
(struct table_value *)ch->valuestate, tent->v.kidx);
return (0);
}
/*
* Drop references for each value used in @tc.
*/
void
ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,
struct table_algo *ta, void *astate, struct table_info *ti)
{
struct flush_args fa;
IPFW_UH_WLOCK_ASSERT(ch);
memset(&fa, 0, sizeof(fa));
fa.ch = ch;
fa.ta = ta;
fa.astate = astate;
fa.ti = ti;
ta->foreach(astate, ti, unref_table_value_cb, &fa);
}
/*
* Table operation state handler.
* Called when we are going to change something in @tc which
* may lead to inconsistencies in on-going table data addition.
*
* Here we rollback all already committed state (table values, currently)
* and set "modified" field to non-zero value to indicate
* that we need to restart original operation.
*/
void
rollback_table_values(struct tableop_state *ts)
{
struct ip_fw_chain *ch;
struct table_value *pval;
struct tentry_info *ptei;
struct namedobj_instance *vi;
int i;
ch = ts->ch;
IPFW_UH_WLOCK_ASSERT(ch);
/* Get current table value pointer */
get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
for (i = 0; i < ts->count; i++) {
ptei = &ts->tei[i];
if (ptei->value == 0)
continue;
unref_table_value(vi, pval, ptei->value);
}
}
/*
* Allocate new value index in either shared or per-table array.
* Function may drop/reacquire UH lock.
*
* Returns 0 on success.
*/
static int
alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts,
struct namedobj_instance *vi, uint16_t *pvidx)
{
int error, vlimit;
uint16_t vidx;
IPFW_UH_WLOCK_ASSERT(ch);
error = ipfw_objhash_alloc_idx(vi, &vidx);
if (error != 0) {
/*
* We need to resize array. This involves
* lock/unlock, so we need to check "modified"
* state.
*/
ts->opstate.func(ts->tc, &ts->opstate);
error = resize_shared_value_storage(ch);
return (error); /* ts->modified should be set, we will restart */
}
vlimit = ts->ta->vlimit;
if (vlimit != 0 && vidx >= vlimit) {
/*
* Algorithm is not able to store given index.
* We have to rollback state, start using
* per-table value array or return error
* if we're already using it.
*
* TODO: do not rollback state if
* atomicity is not required.
*/
if (ts->vshared != 0) {
/* shared -> per-table */
return (ENOSPC); /* TODO: proper error */
}
/* per-table. Fail for now. */
return (ENOSPC); /* TODO: proper error */
}
*pvidx = vidx;
return (0);
}
/*
* Drops value reference for unused values (updates, deletes, partially
* successful adds or rollbacks).
*/
void
ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,
struct tentry_info *tei, uint32_t count, int rollback)
{
int i;
struct tentry_info *ptei;
struct table_value *pval;
struct namedobj_instance *vi;
/*
* We have two slightly different ADD cases here:
* either (1) we are successful / partially successful,
* in that case we need
* * to ignore ADDED entries values
* * rollback every other values (either UPDATED since
* old value has been stored there, or some failure like
* EXISTS or LIMIT or simply "ignored" case.
*
* (2): atomic rollback of partially successful operation
* in that case we simply need to unref all entries.
*
* DELETE case is simpler: no atomic support there, so
* we simply unref all non-zero values.
*/
/*
* Get current table value pointers.
* XXX: Properly read vshared
*/
get_value_ptrs(ch, tc, 1, &pval, &vi);
for (i = 0; i < count; i++) {
ptei = &tei[i];
if (ptei->value == 0) {
/*
* We may be deleting non-existing record.
* Skip.
*/
continue;
}
if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) {
ptei->value = 0;
continue;
}
unref_table_value(vi, pval, ptei->value);
ptei->value = 0;
}
}
/*
* Main function used to link values of entries going to be added,
* to the index. Since we may perform many UH locks drops/acquires,
* handle changes by checking tablestate "modified" field.
*
* Success: return 0.
*/
int
ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts)
{
int error, i, found;
struct namedobj_instance *vi;
struct table_config *tc;
struct tentry_info *tei, *ptei;
uint32_t count, vlimit;
uint16_t vidx;
struct table_val_link *ptv;
struct table_value tval, *pval;
/*
* Stage 1: reference all existing values and
* save their indices.
*/
IPFW_UH_WLOCK_ASSERT(ch);
get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
error = 0;
found = 0;
vlimit = ts->ta->vlimit;
vidx = 0;
tc = ts->tc;
tei = ts->tei;
count = ts->count;
for (i = 0; i < count; i++) {
ptei = &tei[i];
ptei->value = 0; /* Ensure value is always 0 in the beginning */
mask_table_value(ptei->pvalue, &tval, ts->vmask);
ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
(char *)&tval);
if (ptv == NULL)
continue;
/* Deal with vlimit later */
if (vlimit > 0 && vlimit <= ptv->no.kidx)
continue;
/* Value found. Bump refcount */
ptv->pval->refcnt++;
ptei->value = ptv->no.kidx;
found++;
}
if (ts->count == found) {
/* We've found all values , no need ts create new ones */
return (0);
}
/*
* we have added some state here, let's attach operation
* state ts the list ts be able ts rollback if necessary.
*/
add_toperation_state(ch, ts);
/* Ensure table won't disappear */
tc_ref(tc);
IPFW_UH_WUNLOCK(ch);
/*
* Stage 2: allocate objects for non-existing values.
*/
for (i = 0; i < count; i++) {
ptei = &tei[i];
if (ptei->value != 0)
continue;
if (ptei->ptv != NULL)
continue;
ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,
M_WAITOK | M_ZERO);
}
/*
* Stage 3: allocate index numbers for new values
* and link them to index.
*/
IPFW_UH_WLOCK(ch);
tc_unref(tc);
del_toperation_state(ch, ts);
if (ts->modified != 0) {
/*
* In general, we should free all state/indexes here
* and return. However, we keep allocated state instead
* to ensure we achieve some progress on each restart.
*/
return (0);
}
KASSERT(pval == ch->valuestate, ("resize_storage() notify failure"));
/* Let's try to link values */
for (i = 0; i < count; i++) {
ptei = &tei[i];
/* Check if record has appeared */
mask_table_value(ptei->pvalue, &tval, ts->vmask);
ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
(char *)&tval);
if (ptv != NULL) {
ptv->pval->refcnt++;
ptei->value = ptv->no.kidx;
continue;
}
/* May perform UH unlock/lock */
error = alloc_table_vidx(ch, ts, vi, &vidx);
if (error != 0) {
ts->opstate.func(ts->tc, &ts->opstate);
return (error);
}
/* value storage resize has happened, return */
if (ts->modified != 0)
return (0);
/* Finally, we have allocated valid index, let's add entry */
ptei->value = vidx;
ptv = (struct table_val_link *)ptei->ptv;
ptei->ptv = NULL;
ptv->no.kidx = vidx;
ptv->no.name = (char *)&pval[vidx];
ptv->pval = &pval[vidx];
memcpy(ptv->pval, &tval, sizeof(struct table_value));
pval[vidx].refcnt = 1;
ipfw_objhash_add(vi, &ptv->no);
}
return (0);
}
/*
* Compatibility function used to import data from old
* IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes.
*/
void
ipfw_import_table_value_legacy(uint32_t value, struct table_value *v)
{
memset(v, 0, sizeof(*v));
v->tag = value;
v->pipe = value;
v->divert = value;
v->skipto = value;
v->netgraph = value;
v->fib = value;
v->nat = value;
v->nh4 = value; /* host format */
v->dscp = value;
v->limit = value;
}
/*
* Export data to legacy table dumps opcodes.
*/
uint32_t
ipfw_export_table_value_legacy(struct table_value *v)
{
/*
* TODO: provide more compatibility depending on
* vmask value.
*/
return (v->tag);
}
/*
* Imports table value from current userland format.
* Saves value in kernel format to the same place.
*/
void
ipfw_import_table_value_v1(ipfw_table_value *iv)
{
struct table_value v;
memset(&v, 0, sizeof(v));
v.tag = iv->tag;
v.pipe = iv->pipe;
v.divert = iv->divert;
v.skipto = iv->skipto;
v.netgraph = iv->netgraph;
v.fib = iv->fib;
v.nat = iv->nat;
v.dscp = iv->dscp;
v.nh4 = iv->nh4;
v.nh6 = iv->nh6;
v.limit = iv->limit;
v.zoneid = iv->zoneid;
memcpy(iv, &v, sizeof(ipfw_table_value));
}
/*
* Export real table value @v to current userland format.
* Note that @v and @piv may point to the same memory.
*/
void
ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv)
{
ipfw_table_value iv;
memset(&iv, 0, sizeof(iv));
iv.tag = v->tag;
iv.pipe = v->pipe;
iv.divert = v->divert;
iv.skipto = v->skipto;
iv.netgraph = v->netgraph;
iv.fib = v->fib;
iv.nat = v->nat;
iv.dscp = v->dscp;
iv.limit = v->limit;
iv.nh4 = v->nh4;
iv.nh6 = v->nh6;
iv.zoneid = v->zoneid;
memcpy(piv, &iv, sizeof(iv));
}
/*
* Exports real value data into ipfw_table_value structure.
* Utilizes "spare1" field to store kernel index.
*/
static int
dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
{
struct vdump_args *da;
struct table_val_link *ptv;
struct table_value *v;
da = (struct vdump_args *)arg;
ptv = (struct table_val_link *)no;
v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v));
/* Out of memory, returning */
if (v == NULL) {
da->error = ENOMEM;
return (ENOMEM);
}
memcpy(v, ptv->pval, sizeof(*v));
v->spare1 = ptv->no.kidx;
return (0);
}
/*
* Dumps all shared/table value data
* Data layout (v1)(current):
* Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
* Reply: [ ipfw_obj_lheader ipfw_table_value x N ]
*
* Returns 0 on success
*/
static int
list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
struct sockopt_data *sd)
{
struct _ipfw_obj_lheader *olh;
struct namedobj_instance *vi;
struct vdump_args da;
uint32_t count, size;
olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
if (olh == NULL)
return (EINVAL);
if (sd->valsize < olh->size)
return (EINVAL);
IPFW_UH_RLOCK(ch);
vi = CHAIN_TO_VI(ch);
count = ipfw_objhash_count(vi);
size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader);
/* Fill in header regadless of buffer size */
olh->count = count;
olh->objsize = sizeof(ipfw_table_value);
if (size > olh->size) {
olh->size = size;
IPFW_UH_RUNLOCK(ch);
return (ENOMEM);
}
olh->size = size;
/*
* Do the actual value dump
*/
memset(&da, 0, sizeof(da));
da.ch = ch;
da.sd = sd;
ipfw_objhash_foreach(vi, dump_tvalue, &da);
IPFW_UH_RUNLOCK(ch);
return (0);
}
void
ipfw_table_value_init(struct ip_fw_chain *ch, int first)
{
struct tables_config *tcfg;
ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value),
M_IPFW, M_WAITOK | M_ZERO);
tcfg = ch->tblcfg;
tcfg->val_size = VALDATA_START_SIZE;
tcfg->valhash = ipfw_objhash_create(tcfg->val_size);
ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value,
cmp_table_value);
IPFW_ADD_SOPT_HANDLER(first, scodes);
}
static int
destroy_value(struct namedobj_instance *ni, struct named_object *no,
void *arg)
{
free(no, M_IPFW);
return (0);
}
void
ipfw_table_value_destroy(struct ip_fw_chain *ch, int last)
{
IPFW_DEL_SOPT_HANDLER(last, scodes);
free(ch->valuestate, M_IPFW);
ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch);
ipfw_objhash_destroy(CHAIN_TO_VI(ch));
}