numam-dpdk/lib/eal/x86/rte_power_intrinsics.c
Anatoly Burakov 66834f2974 eal: add power monitor for multiple events
Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
what UMWAIT does, but without the limitation of having to listen for
just one event. This works because the optimized power state used by the
TPAUSE instruction will cause a wake up on RTM transaction abort, so if
we add the addresses we're interested in to the read-set, any write to
those addresses will wake us up.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Tested-by: David Hunt <david.hunt@intel.com>
2021-07-09 21:13:13 +02:00
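
As a usage illustration for the new multi-event monitor, a caller could wait on
two ring tail pointers at once. The sketch below is hypothetical, not code from
this commit: it assumes the 21.08-era definition of struct
rte_power_monitor_cond from rte_power_intrinsics.h (addr/size/fn/opaque fields,
with opaque a small uint64_t array handed to the callback), and the names
tail_moved, wait_on_rings, tail0 and tail1 are invented for the example.

	/* callback: abort the sleep once the tail differs from the cached value */
	static int
	tail_moved(const uint64_t cur, const uint64_t opaque[])
	{
		return cur != opaque[0];
	}

	/* sleep in C0.2 until either tail moves or the TSC deadline passes */
	static void
	wait_on_rings(volatile uint64_t *tail0, volatile uint64_t *tail1,
			uint64_t tsc_deadline)
	{
		struct rte_power_monitor_cond pmc[2] = {
			{ .addr = tail0, .size = sizeof(uint64_t),
			  .fn = tail_moved, .opaque = { *tail0 } },
			{ .addr = tail1, .size = sizeof(uint64_t),
			  .fn = tail_moved, .opaque = { *tail1 } },
		};

		rte_power_monitor_multi(pmc, RTE_DIM(pmc), tsc_deadline);
	}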

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_common.h>
#include <rte_lcore.h>
#include <rte_rtm.h>
#include <rte_spinlock.h>

#include "rte_power_intrinsics.h"

/*
 * Per-lcore structure holding current status of C0.2 sleeps.
 */
static struct power_wait_status {
	rte_spinlock_t lock;
	volatile void *monitor_addr; /**< NULL if not currently sleeping */
} __rte_cache_aligned wait_status[RTE_MAX_LCORE];

static inline void
__umwait_wakeup(volatile void *addr)
{
	uint64_t val;

	/* trigger a write but don't change the value */
	val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
	__atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
			__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}

static bool wait_supported;
static bool wait_multi_supported;

static inline uint64_t
__get_umwait_val(const volatile void *p, const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):
		return *(const volatile uint8_t *)p;
	case sizeof(uint16_t):
		return *(const volatile uint16_t *)p;
	case sizeof(uint32_t):
		return *(const volatile uint32_t *)p;
	case sizeof(uint64_t):
		return *(const volatile uint64_t *)p;
	default:
		/* shouldn't happen */
		RTE_ASSERT(0);
		return 0;
	}
}

static inline int
__check_val_size(const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):  /* fall-through */
	case sizeof(uint16_t): /* fall-through */
	case sizeof(uint32_t): /* fall-through */
	case sizeof(uint64_t): /* fall-through */
		return 0;
	default:
		/* unexpected size */
		return -1;
	}
}

/**
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
int
rte_power_monitor(const struct rte_power_monitor_cond *pmc,
		const uint64_t tsc_timestamp)
{
	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint64_t cur_value;

	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL)
		return -EINVAL;

	if (__check_val_size(pmc->size) < 0)
		return -EINVAL;

	if (pmc->fn == NULL)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* update sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = pmc->addr;

	/*
	 * we're using raw byte codes for now as only the newest compiler
	 * versions support this instruction natively.
	 */

	/* set address for UMONITOR */
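	/* the raw bytes encode "umonitor %rdi" (F3 0F AE /6, ModRM 0xf7 selects rdi) */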
	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
			:
			: "D"(pmc->addr));

	/* now that we've put this address into monitor, we can unlock */
	rte_spinlock_unlock(&s->lock);

	cur_value = __get_umwait_val(pmc->addr, pmc->size);

	/* check if callback indicates we should abort */
	if (pmc->fn(cur_value, pmc->opaque) != 0)
		goto end;

	/* execute UMWAIT */
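	/* the raw bytes encode "umwait %rdi"; bit 0 of rdi selects C0.2 (0) vs C0.1 (1) */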
	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));

end:
	/* erase sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = NULL;
	rte_spinlock_unlock(&s->lock);

	return 0;
}

/**
 * This function uses TPAUSE instruction and will enter C0.2 state. For more
 * information about usage of this instruction, please refer to Intel(R) 64 and
 * IA-32 Architectures Software Developer's Manual.
 */
int
rte_power_pause(const uint64_t tsc_timestamp)
{
	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);

	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* execute TPAUSE */
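	/* the raw bytes encode "tpause %rdi" (66 0F AE /6); edx:eax holds the TSC timeout */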
	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));

	return 0;
}

RTE_INIT(rte_power_intrinsics_init)
{
	struct rte_cpu_intrinsics i;

	rte_cpu_get_intrinsics_support(&i);

	if (i.power_monitor && i.power_pause)
		wait_supported = 1;
	if (i.power_monitor_multi)
		wait_multi_supported = 1;
}

int
rte_power_monitor_wakeup(const unsigned int lcore_id)
{
	struct power_wait_status *s;

	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* prevent buffer overrun */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/*
	 * There is a race condition between sleep, wakeup and locking, but we
	 * don't need to handle it.
	 *
	 * Possible situations:
	 *
	 * 1. T1 locks, sets address, unlocks
	 * 2. T2 locks, triggers wakeup, unlocks
	 * 3. T1 sleeps
	 *
	 * In this case, because T1 has already set the address for monitoring,
	 * we will wake up immediately even if T2 triggers wakeup before T1
	 * goes to sleep.
	 *
	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
	 * 2. T2 locks, triggers wakeup, and unlocks
	 * 3. T1 locks, erases address, and unlocks
	 *
	 * In this case, since we've already woken up, the "wakeup" was
	 * unneeded, and since T1 is still waiting on T2 releasing the lock, the
	 * wakeup address is still valid so it's perfectly safe to write it.
	 *
	 * For the multi-monitor case, the act of locking will in itself trigger
	 * the wakeup, so no additional writes are necessary.
	 */
	rte_spinlock_lock(&s->lock);
	if (s->monitor_addr != NULL)
		__umwait_wakeup(s->monitor_addr);
	rte_spinlock_unlock(&s->lock);

	return 0;
}

int
rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
		const uint32_t num, const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint32_t i, rc;

	/* check if supported */
	if (!wait_multi_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from indexing past the per-lcore array */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL || num == 0)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* we are already inside transaction region, return */
	if (rte_xtest() != 0)
		return 0;

	/* start new transaction region */
	rc = rte_xbegin();

	/* transaction abort, possible write to one of wait addresses */
	if (rc != RTE_XBEGIN_STARTED)
		return 0;

	/*
	 * the mere act of reading the lock status here adds the lock to
	 * the read set. This means that when we trigger a wakeup from another
	 * thread, even if we don't have a defined wakeup address and thus don't
	 * actually cause any writes, the act of locking our lock will itself
	 * trigger the wakeup and abort the transaction.
	 */
	rte_spinlock_is_locked(&s->lock);

	/*
	 * add all addresses to wait on into transaction read-set and check if
	 * any of wakeup conditions are already met.
	 */
	rc = 0;
	for (i = 0; i < num; i++) {
		const struct rte_power_monitor_cond *c = &pmc[i];

		/* cannot be NULL */
		if (c->fn == NULL) {
			rc = -EINVAL;
			break;
		}

		const uint64_t val = __get_umwait_val(c->addr, c->size);

		/* abort if callback indicates that we need to stop */
		if (c->fn(val, c->opaque) != 0)
			break;
	}

	/* none of the conditions were met, sleep until timeout */
	if (i == num)
		rte_power_pause(tsc_timestamp);

	/* end transaction region */
	rte_xend();

	return rc;
}