numam-dpdk/lib/librte_timer/rte_timer.c
Robert Sanford a4b7a5a45c timer: fix race condition
Eliminate problematic race condition in rte_timer_manage() that can
lead to corruption of per-lcore pending-lists (implemented as
skip-lists). The race condition occurs when rte_timer_manage() expires
multiple timers on lcore A, while lcore B simultaneously invokes
rte_timer_reset() for one of the expiring timers (other than the first
one).

Lcore A splits its pending-list, creating a local list of expired timers
linked through their sl_next[0] pointers, and sets the first expired
timer to the RUNNING state, all during one list-lock round trip.
Lcore A then unlocks the list-lock to run the first callback, and that
is when A and B can have different interpretations of the subsequent
expired timers' true state. Lcore B sees an expired timer still in the
PENDING state, atomically changes the timer to the CONFIG state, locks
lcore A's list-lock, and reinserts the timer into A's pending-list.
The two lcores try to use the same next-pointers to maintain both lists!

Our solution is to remove expired timers from the pending-list and try
to set them all to the RUNNING state in one atomic step, i.e.,
rte_timer_manage() should perform these two actions within one
ownership of the list-lock.

After splitting the pending-list at the current point in time and trying
to set all expired timers to the RUNNING state, we must put back into
the pending-list any timers that we failed to set to the RUNNING state,
all while still holding the list-lock. It is then safe to release the
lock and run the callback functions for all expired timers that remain
on our local run-list.

Signed-off-by: Robert Sanford <rsanford@akamai.com>
2015-08-03 12:43:01 +02:00

638 lines
19 KiB
C

/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <assert.h>
#include <sys/queue.h>
#include <rte_atomic.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_per_lcore.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_spinlock.h>
#include <rte_random.h>
#include "rte_timer.h"
LIST_HEAD(rte_timer_list, rte_timer);
struct priv_timer {
struct rte_timer pending_head; /**< dummy timer instance to head up list */
rte_spinlock_t list_lock; /**< lock to protect list access */
/** per-core variable that true if a timer was updated on this
* core since last reset of the variable */
int updated;
/** track the current depth of the skiplist */
unsigned curr_skiplist_depth;
unsigned prev_lcore; /**< used for lcore round robin */
#ifdef RTE_LIBRTE_TIMER_DEBUG
/** per-lcore statistics */
struct rte_timer_debug_stats stats;
#endif
} __rte_cache_aligned;
/** per-lcore private info for timers */
static struct priv_timer priv_timer[RTE_MAX_LCORE];
/* when debug is enabled, store some statistics */
#ifdef RTE_LIBRTE_TIMER_DEBUG
#define __TIMER_STAT_ADD(name, n) do { \
unsigned __lcore_id = rte_lcore_id(); \
if (__lcore_id < RTE_MAX_LCORE) \
priv_timer[__lcore_id].stats.name += (n); \
} while(0)
#else
#define __TIMER_STAT_ADD(name, n) do {} while(0)
#endif
/* Init the timer library. */
void
rte_timer_subsystem_init(void)
{
unsigned lcore_id;
/* since priv_timer is static, it's zeroed by default, so only init some
* fields.
*/
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
rte_spinlock_init(&priv_timer[lcore_id].list_lock);
priv_timer[lcore_id].prev_lcore = lcore_id;
}
}
/* Initialize the timer handle tim for use */
void
rte_timer_init(struct rte_timer *tim)
{
union rte_timer_status status;
status.state = RTE_TIMER_STOP;
status.owner = RTE_TIMER_NO_OWNER;
tim->status.u32 = status.u32;
}
/*
* if timer is pending or stopped (or running on the same core than
* us), mark timer as configuring, and on success return the previous
* status of the timer
*/
static int
timer_set_config_state(struct rte_timer *tim,
union rte_timer_status *ret_prev_status)
{
union rte_timer_status prev_status, status;
int success = 0;
unsigned lcore_id;
lcore_id = rte_lcore_id();
/* wait that the timer is in correct status before update,
* and mark it as being configured */
while (success == 0) {
prev_status.u32 = tim->status.u32;
/* timer is running on another core, exit */
if (prev_status.state == RTE_TIMER_RUNNING &&
prev_status.owner != (uint16_t)lcore_id)
return -1;
/* timer is being configured on another core */
if (prev_status.state == RTE_TIMER_CONFIG)
return -1;
/* here, we know that timer is stopped or pending,
* mark it atomically as being configured */
status.state = RTE_TIMER_CONFIG;
status.owner = (int16_t)lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
}
ret_prev_status->u32 = prev_status.u32;
return 0;
}
/*
* if timer is pending, mark timer as running
*/
static int
timer_set_running_state(struct rte_timer *tim)
{
union rte_timer_status prev_status, status;
unsigned lcore_id = rte_lcore_id();
int success = 0;
/* wait that the timer is in correct status before update,
* and mark it as running */
while (success == 0) {
prev_status.u32 = tim->status.u32;
/* timer is not pending anymore */
if (prev_status.state != RTE_TIMER_PENDING)
return -1;
/* here, we know that timer is stopped or pending,
* mark it atomically as beeing configured */
status.state = RTE_TIMER_RUNNING;
status.owner = (int16_t)lcore_id;
success = rte_atomic32_cmpset(&tim->status.u32,
prev_status.u32,
status.u32);
}
return 0;
}
/*
* Return a skiplist level for a new entry.
* This probabalistically gives a level with p=1/4 that an entry at level n
* will also appear at level n+1.
*/
static uint32_t
timer_get_skiplist_level(unsigned curr_depth)
{
#ifdef RTE_LIBRTE_TIMER_DEBUG
static uint32_t i, count = 0;
static uint32_t levels[MAX_SKIPLIST_DEPTH] = {0};
#endif
/* probability value is 1/4, i.e. all at level 0, 1 in 4 is at level 1,
* 1 in 16 at level 2, 1 in 64 at level 3, etc. Calculated using lowest
* bit position of a (pseudo)random number.
*/
uint32_t rand = rte_rand() & (UINT32_MAX - 1);
uint32_t level = rand == 0 ? MAX_SKIPLIST_DEPTH : (rte_bsf32(rand)-1) / 2;
/* limit the levels used to one above our current level, so we don't,
* for instance, have a level 0 and a level 7 without anything between
*/
if (level > curr_depth)
level = curr_depth;
if (level >= MAX_SKIPLIST_DEPTH)
level = MAX_SKIPLIST_DEPTH-1;
#ifdef RTE_LIBRTE_TIMER_DEBUG
count ++;
levels[level]++;
if (count % 10000 == 0)
for (i = 0; i < MAX_SKIPLIST_DEPTH; i++)
printf("Level %u: %u\n", (unsigned)i, (unsigned)levels[i]);
#endif
return level;
}
/*
* For a given time value, get the entries at each level which
* are <= that time value.
*/
static void
timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
struct rte_timer **prev)
{
unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
prev[lvl] = &priv_timer[tim_lcore].pending_head;
while(lvl != 0) {
lvl--;
prev[lvl] = prev[lvl+1];
while (prev[lvl]->sl_next[lvl] &&
prev[lvl]->sl_next[lvl]->expire <= time_val)
prev[lvl] = prev[lvl]->sl_next[lvl];
}
}
/*
* Given a timer node in the skiplist, find the previous entries for it at
* all skiplist levels.
*/
static void
timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
struct rte_timer **prev)
{
int i;
/* to get a specific entry in the list, look for just lower than the time
* values, and then increment on each level individually if necessary
*/
timer_get_prev_entries(tim->expire - 1, tim_lcore, prev);
for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
while (prev[i]->sl_next[i] != NULL &&
prev[i]->sl_next[i] != tim &&
prev[i]->sl_next[i]->expire <= tim->expire)
prev[i] = prev[i]->sl_next[i];
}
}
/*
* add in list, lock if needed
* timer must be in config state
* timer must not be in a list
*/
static void
timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
{
unsigned lcore_id = rte_lcore_id();
unsigned lvl;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
/* if timer needs to be scheduled on another core, we need to
* lock the list; if it is on local core, we need to lock if
* we are not called from rte_timer_manage() */
if (tim_lcore != lcore_id || !local_is_locked)
rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
/* find where exactly this element goes in the list of elements
* for each depth. */
timer_get_prev_entries(tim->expire, tim_lcore, prev);
/* now assign it a new level and add at that level */
const unsigned tim_level = timer_get_skiplist_level(
priv_timer[tim_lcore].curr_skiplist_depth);
if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
priv_timer[tim_lcore].curr_skiplist_depth++;
lvl = tim_level;
while (lvl > 0) {
tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
prev[lvl]->sl_next[lvl] = tim;
lvl--;
}
tim->sl_next[0] = prev[0]->sl_next[0];
prev[0]->sl_next[0] = tim;
/* save the lowest list entry into the expire field of the dummy hdr
* NOTE: this is not atomic on 32-bit*/
priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\
pending_head.sl_next[0]->expire;
if (tim_lcore != lcore_id || !local_is_locked)
rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
}
/*
* del from list, lock if needed
* timer must be in config state
* timer must be in a list
*/
static void
timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
int local_is_locked)
{
unsigned lcore_id = rte_lcore_id();
unsigned prev_owner = prev_status.owner;
int i;
struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
/* if timer needs is pending another core, we need to lock the
* list; if it is on local core, we need to lock if we are not
* called from rte_timer_manage() */
if (prev_owner != lcore_id || !local_is_locked)
rte_spinlock_lock(&priv_timer[prev_owner].list_lock);
/* save the lowest list entry into the expire field of the dummy hdr.
* NOTE: this is not atomic on 32-bit */
if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
priv_timer[prev_owner].pending_head.expire =
((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);
/* adjust pointers from previous entries to point past this */
timer_get_prev_entries_for_node(tim, prev_owner, prev);
for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
if (prev[i]->sl_next[i] == tim)
prev[i]->sl_next[i] = tim->sl_next[i];
}
/* in case we deleted last entry at a level, adjust down max level */
for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
priv_timer[prev_owner].curr_skiplist_depth --;
else
break;
if (prev_owner != lcore_id || !local_is_locked)
rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
}
/* Reset and start the timer associated with the timer handle (private func) */
static int
__rte_timer_reset(struct rte_timer *tim, uint64_t expire,
uint64_t period, unsigned tim_lcore,
rte_timer_cb_t fct, void *arg,
int local_is_locked)
{
union rte_timer_status prev_status, status;
int ret;
unsigned lcore_id = rte_lcore_id();
/* round robin for tim_lcore */
if (tim_lcore == (unsigned)LCORE_ID_ANY) {
if (lcore_id < RTE_MAX_LCORE) {
/* EAL thread with valid lcore_id */
tim_lcore = rte_get_next_lcore(
priv_timer[lcore_id].prev_lcore,
0, 1);
priv_timer[lcore_id].prev_lcore = tim_lcore;
} else
/* non-EAL thread do not run rte_timer_manage(),
* so schedule the timer on the first enabled lcore. */
tim_lcore = rte_get_next_lcore(LCORE_ID_ANY, 0, 1);
}
/* wait that the timer is in correct status before update,
* and mark it as being configured */
ret = timer_set_config_state(tim, &prev_status);
if (ret < 0)
return -1;
__TIMER_STAT_ADD(reset, 1);
if (prev_status.state == RTE_TIMER_RUNNING &&
lcore_id < RTE_MAX_LCORE) {
priv_timer[lcore_id].updated = 1;
}
/* remove it from list */
if (prev_status.state == RTE_TIMER_PENDING) {
timer_del(tim, prev_status, local_is_locked);
__TIMER_STAT_ADD(pending, -1);
}
tim->period = period;
tim->expire = expire;
tim->f = fct;
tim->arg = arg;
__TIMER_STAT_ADD(pending, 1);
timer_add(tim, tim_lcore, local_is_locked);
/* update state: as we are in CONFIG state, only us can modify
* the state so we don't need to use cmpset() here */
rte_wmb();
status.state = RTE_TIMER_PENDING;
status.owner = (int16_t)tim_lcore;
tim->status.u32 = status.u32;
return 0;
}
/* Reset and start the timer associated with the timer handle tim */
int
rte_timer_reset(struct rte_timer *tim, uint64_t ticks,
enum rte_timer_type type, unsigned tim_lcore,
rte_timer_cb_t fct, void *arg)
{
uint64_t cur_time = rte_get_timer_cycles();
uint64_t period;
if (unlikely((tim_lcore != (unsigned)LCORE_ID_ANY) &&
!rte_lcore_is_enabled(tim_lcore)))
return -1;
if (type == PERIODICAL)
period = ticks;
else
period = 0;
return __rte_timer_reset(tim, cur_time + ticks, period, tim_lcore,
fct, arg, 0);
}
/* loop until rte_timer_reset() succeed */
void
rte_timer_reset_sync(struct rte_timer *tim, uint64_t ticks,
enum rte_timer_type type, unsigned tim_lcore,
rte_timer_cb_t fct, void *arg)
{
while (rte_timer_reset(tim, ticks, type, tim_lcore,
fct, arg) != 0)
rte_pause();
}
/* Stop the timer associated with the timer handle tim */
int
rte_timer_stop(struct rte_timer *tim)
{
union rte_timer_status prev_status, status;
unsigned lcore_id = rte_lcore_id();
int ret;
/* wait that the timer is in correct status before update,
* and mark it as being configured */
ret = timer_set_config_state(tim, &prev_status);
if (ret < 0)
return -1;
__TIMER_STAT_ADD(stop, 1);
if (prev_status.state == RTE_TIMER_RUNNING &&
lcore_id < RTE_MAX_LCORE) {
priv_timer[lcore_id].updated = 1;
}
/* remove it from list */
if (prev_status.state == RTE_TIMER_PENDING) {
timer_del(tim, prev_status, 0);
__TIMER_STAT_ADD(pending, -1);
}
/* mark timer as stopped */
rte_wmb();
status.state = RTE_TIMER_STOP;
status.owner = RTE_TIMER_NO_OWNER;
tim->status.u32 = status.u32;
return 0;
}
/* loop until rte_timer_stop() succeed */
void
rte_timer_stop_sync(struct rte_timer *tim)
{
while (rte_timer_stop(tim) != 0)
rte_pause();
}
/* Test the PENDING status of the timer handle tim */
int
rte_timer_pending(struct rte_timer *tim)
{
return tim->status.state == RTE_TIMER_PENDING;
}
/* must be called periodically, run all timer that expired */
void rte_timer_manage(void)
{
union rte_timer_status status;
struct rte_timer *tim, *next_tim;
struct rte_timer *run_first_tim, **pprev;
unsigned lcore_id = rte_lcore_id();
struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
uint64_t cur_time;
int i, ret;
/* timer manager only runs on EAL thread with valid lcore_id */
assert(lcore_id < RTE_MAX_LCORE);
__TIMER_STAT_ADD(manage, 1);
/* optimize for the case where per-cpu list is empty */
if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
return;
cur_time = rte_get_timer_cycles();
#ifdef RTE_ARCH_X86_64
/* on 64-bit the value cached in the pending_head.expired will be
* updated atomically, so we can consult that for a quick check here
* outside the lock */
if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
return;
#endif
/* browse ordered list, add expired timers in 'expired' list */
rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
/* if nothing to do just unlock and return */
if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
return;
}
/* save start of list of expired timers */
tim = priv_timer[lcore_id].pending_head.sl_next[0];
/* break the existing list at current time point */
timer_get_prev_entries(cur_time, lcore_id, prev);
for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) {
priv_timer[lcore_id].pending_head.sl_next[i] =
prev[i]->sl_next[i];
if (prev[i]->sl_next[i] == NULL)
priv_timer[lcore_id].curr_skiplist_depth--;
prev[i] ->sl_next[i] = NULL;
}
/* transition run-list from PENDING to RUNNING */
run_first_tim = tim;
pprev = &run_first_tim;
for ( ; tim != NULL; tim = next_tim) {
next_tim = tim->sl_next[0];
ret = timer_set_running_state(tim);
if (likely(ret == 0)) {
pprev = &tim->sl_next[0];
} else {
/* another core is trying to re-config this one,
* remove it from local expired list and put it
* back on the priv_timer[] skip list */
*pprev = next_tim;
timer_add(tim, lcore_id, 1);
}
}
/* update the next to expire timer value */
priv_timer[lcore_id].pending_head.expire =
(priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
priv_timer[lcore_id].pending_head.sl_next[0]->expire;
rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
/* now scan expired list and call callbacks */
for (tim = run_first_tim; tim != NULL; tim = next_tim) {
next_tim = tim->sl_next[0];
priv_timer[lcore_id].updated = 0;
/* execute callback function with list unlocked */
tim->f(tim, tim->arg);
__TIMER_STAT_ADD(pending, -1);
/* the timer was stopped or reloaded by the callback
* function, we have nothing to do here */
if (priv_timer[lcore_id].updated == 1)
continue;
if (tim->period == 0) {
/* remove from done list and mark timer as stopped */
status.state = RTE_TIMER_STOP;
status.owner = RTE_TIMER_NO_OWNER;
rte_wmb();
tim->status.u32 = status.u32;
}
else {
/* keep it in list and mark timer as pending */
rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
status.state = RTE_TIMER_PENDING;
__TIMER_STAT_ADD(pending, 1);
status.owner = (int16_t)lcore_id;
rte_wmb();
tim->status.u32 = status.u32;
__rte_timer_reset(tim, cur_time + tim->period,
tim->period, lcore_id, tim->f, tim->arg, 1);
rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
}
}
}
/* dump statistics about timers */
void rte_timer_dump_stats(FILE *f)
{
#ifdef RTE_LIBRTE_TIMER_DEBUG
struct rte_timer_debug_stats sum;
unsigned lcore_id;
memset(&sum, 0, sizeof(sum));
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
sum.reset += priv_timer[lcore_id].stats.reset;
sum.stop += priv_timer[lcore_id].stats.stop;
sum.manage += priv_timer[lcore_id].stats.manage;
sum.pending += priv_timer[lcore_id].stats.pending;
}
fprintf(f, "Timer statistics:\n");
fprintf(f, " reset = %"PRIu64"\n", sum.reset);
fprintf(f, " stop = %"PRIu64"\n", sum.stop);
fprintf(f, " manage = %"PRIu64"\n", sum.manage);
fprintf(f, " pending = %"PRIu64"\n", sum.pending);
#else
fprintf(f, "No timer statistics, RTE_LIBRTE_TIMER_DEBUG is disabled\n");
#endif
}