Fix a design problem with delayed algorithm sync.

Currently, when an immutable algorithm such as bsearch or radix_lockless
receives an rtable update notification, it schedules an algorithm rebuild.
The rebuild is executed by a callout roughly 50 milliseconds later.

As a result, a script that adds an interface address and then a route
with a gateway bound to that address may fail: the fib has not yet been
rebuilt by the time the route addition request arrives.

Fix this by performing the algorithm rebuild synchronously when certain
conditions are met. By default, a rebuild is synchronous when:
1) the rib holds no more than net.route.algo.fib_sync_limit (default 100)
   routes, or
2) the changed route has no gateway (see the sketch below).
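
In outline, the default policy reduces to a predicate like the one below
(a simplified sketch mirroring the need_immediate_rebuild() check added
in the diff; want_sync_rebuild() is a made-up name and only the RTM_ADD
nexthop is examined):

    /*
     * Sketch: rebuild synchronously for small ribs and for changes
     * not involving a gateway. Per the check in the diff, setting
     * fib_sync_limit to 0 makes every rebuild synchronous.
     */
    static bool
    want_sync_rebuild(struct fib_data *fd, struct rib_cmd_info *rc)
    {
            struct nhop_object *nh = rc->rc_nh_new;

            if (V_fib_sync_limit == 0 ||
                fd->fd_rh->rnh_prefixes <= V_fib_sync_limit)
                    return (true);
            return (nh != NULL && !NH_IS_NHGRP(nh) &&
                (nh->nh_flags & NHF_GATEWAY) == 0);
    }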

* Move the algo instance build entirely under the rib WLOCK. The rib
  lock is used only by the control plane (except in the radix algo,
  which also takes it for lookups, but radix is never rebuilt).
* Add a rib_walk_ext_locked() function to allow RIB iteration with the
  rib lock already held.
* Fix a rare potential callout use-after-free for fds by binding the fd
  callout to the relevant rib rmlock. With that in place, callout_stop()
  under the RIB WLOCK guarantees that no callout will execute afterwards
  (see the sketch after this list).
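
The callout fix relies on a standard property of callout_init_rm(): the
callout subsystem acquires the given rmlock (in write mode, since no
CALLOUT_SHAREDLOCK flag is passed) around each handler invocation. A
minimal sketch of the resulting pattern, reusing the fd/rh names from
the diff:

    /* Setup: bind the rebuild callout to the rib rmlock. */
    callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);

    /*
     * Teardown: the handler itself runs under rib_lock, so stopping
     * the callout while holding the same lock as a writer guarantees
     * the handler cannot fire after this point.
     */
    RIB_WLOCK(rh);
    callout_stop(&fd->fd_callout);
    RIB_WUNLOCK(rh);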

MFC after:	3 days
commit 151ec796a2
parent dd9163003c
Author: Alexander V. Chernikov
Date:   2021-01-30 22:45:46 +00:00

3 changed files with 87 additions and 42 deletions


@@ -105,6 +105,11 @@ SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Fib algorithm lookups");
VNET_DEFINE(int, fib_sync_limit) = 100;
#define V_fib_sync_limit VNET(fib_sync_limit)
SYSCTL_INT(_net_route_algo, OID_AUTO, fib_sync_limit, CTLFLAG_RW | CTLFLAG_VNET,
&VNET_NAME(fib_sync_limit), 0, "Guarantee synchronous fib till route limit");
#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define V_algo_fixed_inet6 VNET(algo_fixed_inet6)
@@ -465,7 +470,8 @@ static void
schedule_fd_rebuild(struct fib_data *fd, const char *reason)
{
FIB_MOD_LOCK();
RIB_WLOCK_ASSERT(fd->fd_rh);
if (!fd->fd_need_rebuild) {
fd->fd_need_rebuild = true;
@@ -477,30 +483,52 @@ schedule_fd_rebuild(struct fib_data *fd, const char *reason)
reason, fd->fd_failed_rebuilds);
schedule_callout(fd, callout_calc_delay_ms(fd));
}
FIB_MOD_UNLOCK();
}
static void
schedule_algo_eval(struct fib_data *fd)
{
RIB_WLOCK_ASSERT(fd->fd_rh);
if (fd->fd_num_changes++ == 0) {
/* Start callout to consider switch */
FIB_MOD_LOCK();
if (!callout_pending(&fd->fd_callout))
schedule_callout(fd, ALGO_EVAL_DELAY_MS);
FIB_MOD_UNLOCK();
} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
/* Reset callout to exec immediately */
FIB_MOD_LOCK();
if (!fd->fd_need_rebuild) {
fd->fd_force_eval = true;
schedule_callout(fd, 1);
}
FIB_MOD_UNLOCK();
}
}
static bool
need_immediate_rebuild(struct fib_data *fd, struct rib_cmd_info *rc)
{
struct nhop_object *nh;
if ((V_fib_sync_limit == 0) || (fd->fd_rh->rnh_prefixes <= V_fib_sync_limit))
return (true);
/* Sync addition/removal of interface routes */
switch (rc->rc_cmd) {
case RTM_ADD:
nh = rc->rc_nh_new;
if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
return (true);
break;
case RTM_DELETE:
nh = rc->rc_nh_old;
if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
return (true);
break;
}
return (false);
}
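
Nexthop groups never qualify for the synchronous path above, since
NH_IS_NHGRP() short-circuits the gateway test in both case branches. A
hypothetical helper (not part of the diff) expressing the condition the
two branches share:

    /* Illustrative only: plain, non-gateway (interface) nexthop. */
    static bool
    is_interface_nhop(struct nhop_object *nh)
    {
            return (!NH_IS_NHGRP(nh) && (nh->nh_flags & NHF_GATEWAY) == 0);
    }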
/*
* Rib subscription handler. Checks if the algorithm is ready to
* receive updates, handles nexthop refcounting and passes change
@@ -559,8 +587,16 @@ handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
* Algo is not able to apply the update.
* Schedule algo rebuild.
*/
if (!need_immediate_rebuild(fd, rc)) {
schedule_fd_rebuild(fd, "algo requested rebuild");
break;
}
fd->fd_need_rebuild = true;
FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
if (!rebuild_fd(fd))
schedule_fd_rebuild(fd, "sync rebuild failed");
break;
case FLM_ERROR:
/*
@@ -678,7 +714,7 @@ sync_algo(struct fib_data *fd)
.result = FLM_SUCCESS,
};
rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
FD_PRINTF(LOG_INFO, fd,
"initial dump completed (rtable version: %d), result: %s",
@@ -702,6 +738,7 @@ schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
bool is_dead;
NET_EPOCH_ASSERT();
RIB_WLOCK_ASSERT(fd->fd_rh);
FIB_MOD_LOCK();
is_dead = fd->fd_dead;
@@ -718,27 +755,13 @@ schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
FD_PRINTF(LOG_INFO, fd, "DETACH");
if (fd->fd_rs != NULL)
rib_unsibscribe(fd->fd_rs);
rib_unsibscribe_locked(fd->fd_rs);
/*
* After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
* will be executed, hence no _new_ callout schedules will happen.
*
* There can be 2 possible scenarious here:
* 1) we're running inside a callout when we're deleting ourselves
* due to migration to a newer fd
* 2) we're running from rt_table_destroy() and callout is scheduled
* for execution OR is executing
*
* For (2) we need to wait for the callout termination, as the routing table
* will be destroyed after this function returns.
* For (1) we cannot call drain, but can ensure that this is the last invocation.
*/
if (in_callout)
callout_stop(&fd->fd_callout);
else
callout_drain(&fd->fd_callout);
epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
&fd->fd_epoch_ctx);
@@ -774,7 +797,11 @@ fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
/* Pass 2: remove each entry */
NET_EPOCH_ENTER(et);
TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
if (!in_callout)
RIB_WLOCK(fd->fd_rh);
schedule_destroy_fd_instance(fd, in_callout);
if (!in_callout)
RIB_WUNLOCK(fd->fd_rh);
}
NET_EPOCH_EXIT(et);
}
@@ -870,7 +897,7 @@ try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
fd->fd_gen = ++fib_gen;
fd->fd_family = rh->rib_family;
fd->fd_fibnum = rh->rib_fibnum;
callout_init(&fd->fd_callout, 1);
callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
fd->fd_vnet = curvnet;
fd->fd_flm = flm;
@@ -908,8 +935,8 @@ try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
/* Try to subscribe */
if (flm->flm_change_rib_item_cb != NULL) {
fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
if (fd->fd_rs == NULL) {
FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
return (FLM_REBUILD);
@@ -946,13 +973,14 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
struct fib_data *prev_fd, *new_fd;
struct epoch_tracker et;
enum flm_op_result result;
NET_EPOCH_ASSERT();
RIB_WLOCK_ASSERT(rh);
prev_fd = orig_fd;
new_fd = NULL;
for (int i = 0; i < FIB_MAX_TRIES; i++) {
NET_EPOCH_ENTER(et);
result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
if ((result == FLM_SUCCESS) && attach)
@@ -962,7 +990,6 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
schedule_destroy_fd_instance(prev_fd, false);
prev_fd = NULL;
}
NET_EPOCH_EXIT(et);
RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
print_op_result(result));
@@ -990,14 +1017,12 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
if (result == FLM_ERROR)
flm_error_add(flm, rh->rib_fibnum);
NET_EPOCH_ENTER(et);
if ((prev_fd != NULL) && (prev_fd != orig_fd))
schedule_destroy_fd_instance(prev_fd, false);
if (new_fd != NULL) {
schedule_destroy_fd_instance(new_fd, false);
new_fd = NULL;
}
NET_EPOCH_EXIT(et);
}
*pfd = new_fd;
@@ -1013,12 +1038,15 @@ static void
rebuild_fd_callout(void *_data)
{
struct fib_data *fd = (struct fib_data *)_data;
struct epoch_tracker et;
FD_PRINTF(LOG_INFO, fd, "running callout rebuild");
NET_EPOCH_ENTER(et);
CURVNET_SET(fd->fd_vnet);
rebuild_fd(fd);
CURVNET_RESTORE();
NET_EPOCH_EXIT(et);
}
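
Note that the handler above enters the net epoch itself but takes no rib
lock: the callout framework already holds rh->rib_lock in write mode
thanks to the callout_init_rm() change in try_setup_fd_instance() above,
which is what lets rebuild_fd() assert RIB_WLOCK. A hypothetical
assertion making the invariant explicit:

    /* Holds by construction once the callout is bound to the rmlock. */
    RIB_WLOCK_ASSERT(fd->fd_rh);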
/*
@@ -1030,17 +1058,16 @@ rebuild_fd(struct fib_data *fd)
{
struct fib_data *fd_new, *fd_tmp;
struct fib_lookup_module *flm_new = NULL;
struct epoch_tracker et;
enum flm_op_result result;
bool need_rebuild = false;
NET_EPOCH_ASSERT();
RIB_WLOCK_ASSERT(fd->fd_rh);
FIB_MOD_LOCK();
need_rebuild = fd->fd_need_rebuild;
fd->fd_need_rebuild = false;
fd->fd_force_eval = false;
fd->fd_num_changes = 0;
FIB_MOD_UNLOCK();
/* First, check if we're still OK to use this algo */
if (!is_algo_fixed(fd->fd_rh))
@@ -1069,9 +1096,7 @@ rebuild_fd(struct fib_data *fd)
FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
/* Remove old instance */
NET_EPOCH_ENTER(et);
schedule_destroy_fd_instance(fd, true);
NET_EPOCH_EXIT(et);
return (true);
}
@@ -1116,6 +1141,7 @@ set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl
char old_algo_name[32], algo_name[32];
struct rib_head *rh = NULL;
enum flm_op_result result;
struct epoch_tracker et;
int error;
/* Fetch current algo/rib for af/family */
@@ -1149,7 +1175,11 @@ set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl
}
fd = NULL;
NET_EPOCH_ENTER(et);
RIB_WLOCK(rh);
result = setup_fd_instance(flm, rh, NULL, &fd, true);
RIB_WUNLOCK(rh);
NET_EPOCH_EXIT(et);
fib_unref_algo(flm);
if (result != FLM_SUCCESS)
return (EINVAL);
@@ -1558,6 +1588,7 @@ fib_select_algo_initial(struct rib_head *rh)
struct fib_lookup_module *flm;
struct fib_data *fd = NULL;
enum flm_op_result result;
struct epoch_tracker et;
int error = 0;
flm = fib_check_best_algo(rh, NULL);
@@ -1567,7 +1598,12 @@ fib_select_algo_initial(struct rib_head *rh)
}
RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
NET_EPOCH_ENTER(et);
RIB_WLOCK(rh);
result = setup_fd_instance(flm, rh, NULL, &fd, false);
RIB_WUNLOCK(rh);
NET_EPOCH_EXIT(et);
RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
if (result == FLM_SUCCESS) {


@@ -75,6 +75,8 @@ void rib_walk_ext(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f,
rib_walk_hook_f_t *hook_f, void *arg);
void rib_walk_ext_internal(struct rib_head *rnh, bool wlock,
rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg);
void rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
rib_walk_hook_f_t *hook_f, void *arg);
void rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f,
void *arg, bool report);


@@ -67,6 +67,17 @@ __FBSDID("$FreeBSD$");
* RIB helper functions.
*/
void
rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
rib_walk_hook_f_t *hook_f, void *arg)
{
if (hook_f != NULL)
hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
if (hook_f != NULL)
hook_f(rnh, RIB_WALK_HOOK_POST, arg);
}
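
A hypothetical caller of the new helper (count_rt() and the counting
logic are made up for illustration; unlike rib_walk_ext_internal(), the
locked variant leaves lock management to the caller):

    static int
    count_rt(struct rtentry *rt, void *arg)
    {
            (*(uint32_t *)arg)++;
            return (0);
    }

    static uint32_t
    count_prefixes(struct rib_head *rnh)
    {
            uint32_t count = 0;

            RIB_WLOCK(rnh);
            rib_walk_ext_locked(rnh, count_rt, NULL, &count);
            RIB_WUNLOCK(rnh);
            return (count);
    }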
/*
* Calls @wa_f with @arg for each entry in the table specified by
* @af and @fibnum.
@@ -86,11 +97,7 @@ rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
RIB_WLOCK(rnh);
else
RIB_RLOCK(rnh);
if (hook_f != NULL)
hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
if (hook_f != NULL)
hook_f(rnh, RIB_WALK_HOOK_POST, arg);
rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
if (wlock)
RIB_WUNLOCK(rnh);
else