lacp: short timeout erroneously declares link-flapping
Panasas was seeing a higher-than-expected number of link-flap events. After joint debugging with the switch vendor, we determined there were problems on both sides; either of which might cause the occasional event, but together caused lots of them. On the switch side, an internal queuing issue was causing LACP PDUs -- which should be sent every second, in short-timeout mode -- to sometimes be sent slightly later than they should have been. In some cases, two successive PDUs were late, but we never saw three late PDUs in a row. On the FreeBSD side, we saw a link-flap event every time there were two late PDUs, while the spec says that it takes *three* seconds of downtime to trigger that event. It turns out that if a PDU was received shortly before the timer code was run, it would decrement less than a full second after the PDU arrived. Then two delayed PDUs would cause two additional decrements, causing it to reach zero less than three seconds after the most-recent on-time PDU. The solution is to note the time a PDU arrives, and only decrement if at least a full second has elapsed since then. Reported by: Greg Foster <gfoster@panasas.com> Reviewed by: gallatin Tested by: Greg Foster <gfoster@panasas.com> MFC after: 3 days Sponsored by: Panasas Differential Revision: https://reviews.freebsd.org/D35070
This commit is contained in:
parent
fa160738bd
commit
00a80538b4
@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/lock.h>
|
||||
#include <sys/rwlock.h>
|
||||
#include <sys/taskqueue.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <net/if.h>
|
||||
#include <net/if_var.h>
|
||||
@ -1731,6 +1732,7 @@ lacp_sm_rx(struct lacp_port *lp, const struct lacpdu *du)
|
||||
* EXPIRED, DEFAULTED, CURRENT -> CURRENT
|
||||
*/
|
||||
|
||||
microuptime(&lp->lp_last_lacpdu_rx);
|
||||
lacp_sm_rx_update_selected(lp, du);
|
||||
lacp_sm_rx_update_ntt(lp, du);
|
||||
lacp_sm_rx_record_pdu(lp, du);
|
||||
@ -1940,14 +1942,26 @@ static void
|
||||
lacp_run_timers(struct lacp_port *lp)
|
||||
{
|
||||
int i;
|
||||
struct timeval time_diff;
|
||||
|
||||
for (i = 0; i < LACP_NTIMER; i++) {
|
||||
KASSERT(lp->lp_timer[i] >= 0,
|
||||
("invalid timer value %d", lp->lp_timer[i]));
|
||||
if (lp->lp_timer[i] == 0) {
|
||||
continue;
|
||||
} else if (--lp->lp_timer[i] <= 0) {
|
||||
if (lacp_timer_funcs[i]) {
|
||||
} else {
|
||||
if (i == LACP_TIMER_CURRENT_WHILE) {
|
||||
microuptime(&time_diff);
|
||||
timevalsub(&time_diff, &lp->lp_last_lacpdu_rx);
|
||||
if (time_diff.tv_sec) {
|
||||
/* At least one sec has elapsed since last LACP packet. */
|
||||
--lp->lp_timer[i];
|
||||
}
|
||||
} else {
|
||||
--lp->lp_timer[i];
|
||||
}
|
||||
|
||||
if ((lp->lp_timer[i] <= 0) && (lacp_timer_funcs[i])) {
|
||||
(*lacp_timer_funcs[i])(lp);
|
||||
}
|
||||
}
|
||||
|
@ -222,6 +222,7 @@ struct lacp_port {
|
||||
#define lp_key lp_actor.lip_key
|
||||
#define lp_systemid lp_actor.lip_systemid
|
||||
struct timeval lp_last_lacpdu;
|
||||
struct timeval lp_last_lacpdu_rx;
|
||||
int lp_lacpdu_sent;
|
||||
enum lacp_mux_state lp_mux_state;
|
||||
enum lacp_selected lp_selected;
|
||||
|
Loading…
Reference in New Issue
Block a user