numam-dpdk/lib/vhost/iotlb.c
David Marchand f8f6b1c174 vhost: stop using mempool for IOTLB cache
A mempool consumes 3 memzones (with the default ring mempool driver).
The default DPDK configuration allows RTE_MAX_MEMZONE (2560) memzones.

Assuming there are no other memzones, that means we can have a
maximum of 853 mempools.

In the vhost library, the IOTLB cache code so far requested one
mempool per vq, which means that, at the maximum, the vhost library
could request mempools for only 426 qps.

This limit was recently reached on big systems with a lot of virtio
ports (and multiqueue in use).

While the limit on mempool count could be something we fix at the DPDK
project level, there is no reason to use mempools for the IOTLB cache:
- the IOTLB cache entries do not need to be DMA-able and are only used
  by the current process (in a multiprocess context),
- getting/putting objects from/in the mempool is always associated with
  some other locks, so some level of lock contention is already present.

We can instead convert to a malloc'd pool whose objects are kept on a
free list protected by a spinlock.

Signed-off-by: David Marchand <david.marchand@redhat.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
2022-09-15 17:56:24 +02:00


/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (c) 2017 Red Hat, Inc.
*/
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif
#include <rte_tailq.h>
#include "iotlb.h"
#include "vhost.h"

struct vhost_iotlb_entry {
	TAILQ_ENTRY(vhost_iotlb_entry) next;
	SLIST_ENTRY(vhost_iotlb_entry) next_free;

	uint64_t iova;
	uint64_t uaddr;
	uint64_t size;
	uint8_t perm;
};

#define IOTLB_CACHE_SIZE 2048
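
/*
 * Entries are taken from a pool preallocated in vhost_user_iotlb_init()
 * and recycled through a free list protected by a spinlock, which
 * replaces the per-vq mempool used previously.
 */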
static struct vhost_iotlb_entry *
vhost_user_iotlb_pool_get(struct vhost_virtqueue *vq)
{
	struct vhost_iotlb_entry *node;

	rte_spinlock_lock(&vq->iotlb_free_lock);
	node = SLIST_FIRST(&vq->iotlb_free_list);
	if (node != NULL)
		SLIST_REMOVE_HEAD(&vq->iotlb_free_list, next_free);
	rte_spinlock_unlock(&vq->iotlb_free_lock);
	return node;
}

static void
vhost_user_iotlb_pool_put(struct vhost_virtqueue *vq,
		struct vhost_iotlb_entry *node)
{
	rte_spinlock_lock(&vq->iotlb_free_lock);
	SLIST_INSERT_HEAD(&vq->iotlb_free_list, node, next_free);
	rte_spinlock_unlock(&vq->iotlb_free_lock);
}
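
/*
 * Forward declaration: the pending-miss path below may evict a cache
 * entry when the pool runs dry.
 */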
static void
vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq);

static void
vhost_user_iotlb_pending_remove_all(struct vhost_virtqueue *vq)
{
	struct vhost_iotlb_entry *node, *temp_node;

	rte_rwlock_write_lock(&vq->iotlb_pending_lock);

	RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_pending_list, next, temp_node) {
		TAILQ_REMOVE(&vq->iotlb_pending_list, node, next);
		vhost_user_iotlb_pool_put(vq, node);
	}

	rte_rwlock_write_unlock(&vq->iotlb_pending_lock);
}
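
/* Check whether an IOTLB miss for this iova/perm is already pending. */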
bool
vhost_user_iotlb_pending_miss(struct vhost_virtqueue *vq, uint64_t iova,
				uint8_t perm)
{
	struct vhost_iotlb_entry *node;
	bool found = false;

	rte_rwlock_read_lock(&vq->iotlb_pending_lock);

	TAILQ_FOREACH(node, &vq->iotlb_pending_list, next) {
		if ((node->iova == iova) && (node->perm == perm)) {
			found = true;
			break;
		}
	}

	rte_rwlock_read_unlock(&vq->iotlb_pending_lock);

	return found;
}
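
/*
 * Record an IOTLB miss as pending. If the pool is exhausted, all pending
 * entries are dropped (or, when there are none, a cached entry is evicted)
 * to free a node.
 */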
void
vhost_user_iotlb_pending_insert(struct virtio_net *dev, struct vhost_virtqueue *vq,
				uint64_t iova, uint8_t perm)
{
	struct vhost_iotlb_entry *node;

	node = vhost_user_iotlb_pool_get(vq);
	if (node == NULL) {
		VHOST_LOG_CONFIG(dev->ifname, DEBUG,
			"IOTLB pool for vq %"PRIu32" empty, clear entries for pending insertion\n",
			vq->index);
		if (!TAILQ_EMPTY(&vq->iotlb_pending_list))
			vhost_user_iotlb_pending_remove_all(vq);
		else
			vhost_user_iotlb_cache_random_evict(vq);
		node = vhost_user_iotlb_pool_get(vq);
		if (node == NULL) {
			VHOST_LOG_CONFIG(dev->ifname, ERR,
				"IOTLB pool vq %"PRIu32" still empty, pending insertion failure\n",
				vq->index);
			return;
		}
	}

	node->iova = iova;
	node->perm = perm;

	rte_rwlock_write_lock(&vq->iotlb_pending_lock);

	TAILQ_INSERT_TAIL(&vq->iotlb_pending_list, node, next);

	rte_rwlock_write_unlock(&vq->iotlb_pending_lock);
}

void
vhost_user_iotlb_pending_remove(struct vhost_virtqueue *vq,
				uint64_t iova, uint64_t size, uint8_t perm)
{
	struct vhost_iotlb_entry *node, *temp_node;

	rte_rwlock_write_lock(&vq->iotlb_pending_lock);

	RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_pending_list, next,
				temp_node) {
		if (node->iova < iova)
			continue;
		if (node->iova >= iova + size)
			continue;
		if ((node->perm & perm) != node->perm)
			continue;
		TAILQ_REMOVE(&vq->iotlb_pending_list, node, next);
		vhost_user_iotlb_pool_put(vq, node);
	}

	rte_rwlock_write_unlock(&vq->iotlb_pending_lock);
}

static void
vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue *vq)
{
	struct vhost_iotlb_entry *node, *temp_node;

	rte_rwlock_write_lock(&vq->iotlb_lock);

	RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
		TAILQ_REMOVE(&vq->iotlb_list, node, next);
		vhost_user_iotlb_pool_put(vq, node);
	}

	vq->iotlb_cache_nr = 0;

	rte_rwlock_write_unlock(&vq->iotlb_lock);
}
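
/*
 * Evict a single, randomly chosen cache entry. Callers only invoke this
 * with a non-empty cache, so the modulo below cannot divide by zero.
 */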
static void
vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue *vq)
{
	struct vhost_iotlb_entry *node, *temp_node;
	int entry_idx;

	rte_rwlock_write_lock(&vq->iotlb_lock);

	entry_idx = rte_rand() % vq->iotlb_cache_nr;

	RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
		if (!entry_idx) {
			TAILQ_REMOVE(&vq->iotlb_list, node, next);
			vhost_user_iotlb_pool_put(vq, node);
			vq->iotlb_cache_nr--;
			break;
		}
		entry_idx--;
	}

	rte_rwlock_write_unlock(&vq->iotlb_lock);
}
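
/*
 * Insert a translation, keeping the cache list sorted by iova. Any
 * matching pending miss is cleared once the entry is in place.
 */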
void
vhost_user_iotlb_cache_insert(struct virtio_net *dev, struct vhost_virtqueue *vq,
				uint64_t iova, uint64_t uaddr,
				uint64_t size, uint8_t perm)
{
	struct vhost_iotlb_entry *node, *new_node;

	new_node = vhost_user_iotlb_pool_get(vq);
	if (new_node == NULL) {
		VHOST_LOG_CONFIG(dev->ifname, DEBUG,
			"IOTLB pool vq %"PRIu32" empty, clear entries for cache insertion\n",
			vq->index);
		if (!TAILQ_EMPTY(&vq->iotlb_list))
			vhost_user_iotlb_cache_random_evict(vq);
		else
			vhost_user_iotlb_pending_remove_all(vq);
		new_node = vhost_user_iotlb_pool_get(vq);
		if (new_node == NULL) {
			VHOST_LOG_CONFIG(dev->ifname, ERR,
				"IOTLB pool vq %"PRIu32" still empty, cache insertion failed\n",
				vq->index);
			return;
		}
	}

	new_node->iova = iova;
	new_node->uaddr = uaddr;
	new_node->size = size;
	new_node->perm = perm;

	rte_rwlock_write_lock(&vq->iotlb_lock);

	TAILQ_FOREACH(node, &vq->iotlb_list, next) {
		/*
		 * Entries must be invalidated before being updated.
		 * So if iova already in list, assume identical.
		 */
		if (node->iova == new_node->iova) {
			vhost_user_iotlb_pool_put(vq, new_node);
			goto unlock;
		} else if (node->iova > new_node->iova) {
			TAILQ_INSERT_BEFORE(node, new_node, next);
			vq->iotlb_cache_nr++;
			goto unlock;
		}
	}

	TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next);
	vq->iotlb_cache_nr++;

unlock:
	vhost_user_iotlb_pending_remove(vq, iova, size, perm);

	rte_rwlock_write_unlock(&vq->iotlb_lock);
}

void
vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq,
				uint64_t iova, uint64_t size)
{
	struct vhost_iotlb_entry *node, *temp_node;

	if (unlikely(!size))
		return;

	rte_rwlock_write_lock(&vq->iotlb_lock);

	RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
		/* Sorted list */
		if (unlikely(iova + size < node->iova))
			break;

		if (iova < node->iova + node->size) {
			TAILQ_REMOVE(&vq->iotlb_list, node, next);
			vhost_user_iotlb_pool_put(vq, node);
			vq->iotlb_cache_nr--;
		}
	}

	rte_rwlock_write_unlock(&vq->iotlb_lock);
}
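
/*
 * Translate iova into a host virtual address, accumulating contiguous
 * entries until *size bytes are covered. No lock is taken here; the
 * caller is expected to hold iotlb_lock for reading.
 */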
uint64_t
vhost_user_iotlb_cache_find(struct vhost_virtqueue *vq, uint64_t iova,
				uint64_t *size, uint8_t perm)
{
	struct vhost_iotlb_entry *node;
	uint64_t offset, vva = 0, mapped = 0;

	if (unlikely(!*size))
		goto out;

	TAILQ_FOREACH(node, &vq->iotlb_list, next) {
		/* List sorted by iova */
		if (unlikely(iova < node->iova))
			break;

		if (iova >= node->iova + node->size)
			continue;

		if (unlikely((perm & node->perm) != perm)) {
			vva = 0;
			break;
		}

		offset = iova - node->iova;
		if (!vva)
			vva = node->uaddr + offset;

		mapped += node->size - offset;
		iova = node->iova + node->size;

		if (mapped >= *size)
			break;
	}

out:
	/* Only part of the requested chunk is mapped */
	if (unlikely(mapped < *size))
		*size = mapped;

	return vva;
}

void
vhost_user_iotlb_flush_all(struct vhost_virtqueue *vq)
{
	vhost_user_iotlb_cache_remove_all(vq);
	vhost_user_iotlb_pending_remove_all(vq);
}
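
/*
 * (Re)allocate the per-virtqueue entry pool, preferably on the NUMA node
 * backing the virtqueue when RTE_LIBRTE_VHOST_NUMA is enabled.
 */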
int
vhost_user_iotlb_init(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	unsigned int i;
	int socket = 0;

	if (vq->iotlb_pool) {
		/*
		 * The cache has already been initialized,
		 * just drop all cached and pending entries.
		 */
		vhost_user_iotlb_flush_all(vq);
		rte_free(vq->iotlb_pool);
	}

#ifdef RTE_LIBRTE_VHOST_NUMA
	if (get_mempolicy(&socket, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR) != 0)
		socket = 0;
#endif

	rte_spinlock_init(&vq->iotlb_free_lock);
	rte_rwlock_init(&vq->iotlb_lock);
	rte_rwlock_init(&vq->iotlb_pending_lock);

	SLIST_INIT(&vq->iotlb_free_list);
	TAILQ_INIT(&vq->iotlb_list);
	TAILQ_INIT(&vq->iotlb_pending_list);

	vq->iotlb_pool = rte_calloc_socket("iotlb", IOTLB_CACHE_SIZE,
		sizeof(struct vhost_iotlb_entry), 0, socket);
	if (!vq->iotlb_pool) {
		VHOST_LOG_CONFIG(dev->ifname, ERR,
			"Failed to create IOTLB cache pool for vq %"PRIu32"\n",
			vq->index);
		return -1;
	}
	for (i = 0; i < IOTLB_CACHE_SIZE; i++)
		vhost_user_iotlb_pool_put(vq, &vq->iotlb_pool[i]);

	vq->iotlb_cache_nr = 0;

	return 0;
}

void
vhost_user_iotlb_destroy(struct vhost_virtqueue *vq)
{
	rte_free(vq->iotlb_pool);
}