bdev/rbd: Remove epoll based group polling mechanism.

We know that librbd's read/write operations are handled by a
non-SPDK thread, so we can get rid of the epoll-based group
polling and directly use the async completion callback. This makes the
code simpler and easier to maintain.

And we still need to keep the io_device registration for this module,
because the I/Os are async. We need the channel reference on "rbd_if"
in order to know which rbd disks are still active.

Change-Id: I1c140a4b286dbfe113ed2a67bd2875de605e8f24
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/9335
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
This commit is contained in:
Ziye Yang 2021-08-30 23:53:54 +08:00 committed by Tomasz Zawadzki
parent 845db70ccc
commit 62210eff55

View File

@ -37,8 +37,6 @@
#include <rbd/librbd.h>
#include <rados/librados.h>
#include <sys/eventfd.h>
#include <sys/epoll.h>
#include "spdk/env.h"
#include "spdk/bdev.h"
@ -51,9 +49,6 @@
#include "spdk/bdev_module.h"
#include "spdk/log.h"
#define SPDK_RBD_QUEUE_DEPTH 128
#define MAX_EVENTS_PER_POLL 128
static int bdev_rbd_count = 0;
struct bdev_rbd {
@ -69,25 +64,19 @@ struct bdev_rbd {
rados_ioctx_t io_ctx;
rbd_image_t image;
int pfd;
rbd_image_info_t info;
pthread_mutex_t mutex;
struct spdk_thread *main_td;
struct spdk_thread *destruct_td;
uint32_t ch_count;
struct bdev_rbd_group_channel *group_ch;
struct spdk_io_channel *group_ch;
TAILQ_ENTRY(bdev_rbd) tailq;
struct spdk_poller *reset_timer;
struct spdk_bdev_io *reset_bdev_io;
};
/*
 * Context of the epoll-based polling group.
 * NOTE(review): the commit message says the epoll based group polling is
 * removed, so this struct presumably belongs to the pre-change side of the
 * diff - confirm against the real file.
 */
struct bdev_rbd_group_channel {
/* Poller registered with bdev_rbd_group_poll() as its callback. */
struct spdk_poller *poller;
/* Descriptor returned by epoll_create1(); disk event fds are added to it. */
int epoll_fd;
};
/* I/O channel context; it only carries a pointer to a bdev_rbd. */
struct bdev_rbd_io_channel {
/* presumably the rbd disk this channel was opened on - TODO confirm */
struct bdev_rbd *disk;
};
@ -379,12 +368,6 @@ bdev_rbd_exit(rbd_image_t image)
rbd_close(image);
}
/*
 * Placeholder completion callback: it receives the completion handle and the
 * callback argument and deliberately ignores both.
 */
static void
bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg)
{
	(void)cb;
	(void)arg;
}
static void
_bdev_rbd_io_complete(void *_rbd_io)
{
@ -408,6 +391,35 @@ bdev_rbd_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status stat
}
}
/*
 * Completion callback for an rbd asynchronous request.
 *
 * Looks up the bdev I/O attached to the completion, turns the librbd return
 * value into a bdev I/O status, releases the completion and then reports the
 * status through bdev_rbd_io_complete().
 *
 * cb  - completion handle the request was submitted with.
 * arg - unused; the bdev I/O is fetched from the completion itself.
 */
static void
bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg)
{
	struct spdk_bdev_io *io = rbd_aio_get_arg(cb);
	struct bdev_rbd_io *ctx = (struct bdev_rbd_io *)io->driver_ctx;
	int rc = rbd_aio_get_return_value(cb);
	enum spdk_bdev_io_status status;

	if (io->type == SPDK_BDEV_IO_TYPE_READ) {
		/* A read succeeds only when the return value equals the requested length. */
		status = ((int)ctx->total_len == rc) ?
			 SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	} else {
		/* Every other I/O type reports success as 0. */
		status = (rc == 0) ?
			 SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	}

	rbd_aio_release(cb);
	bdev_rbd_io_complete(io, status);
}
static void
bdev_rbd_start_aio(struct bdev_rbd *disk, struct spdk_bdev_io *bdev_io,
struct iovec *iov, int iovcnt, uint64_t offset, size_t len)
@ -451,7 +463,6 @@ err:
}
static int bdev_rbd_library_init(void);
static void bdev_rbd_library_fini(void);
static int
@ -641,64 +652,18 @@ bdev_rbd_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
}
}
static void
bdev_rbd_io_poll(struct bdev_rbd *disk)
{
int i, io_status, rc;
rbd_completion_t comps[SPDK_RBD_QUEUE_DEPTH];
struct spdk_bdev_io *bdev_io;
struct bdev_rbd_io *rbd_io;
enum spdk_bdev_io_status bio_status;
rc = rbd_poll_io_events(disk->image, comps, SPDK_RBD_QUEUE_DEPTH);
for (i = 0; i < rc; i++) {
bdev_io = rbd_aio_get_arg(comps[i]);
rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx;
io_status = rbd_aio_get_return_value(comps[i]);
bio_status = SPDK_BDEV_IO_STATUS_SUCCESS;
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
if ((int)rbd_io->total_len != io_status) {
bio_status = SPDK_BDEV_IO_STATUS_FAILED;
}
} else {
/* For others, 0 means success */
if (io_status != 0) {
bio_status = SPDK_BDEV_IO_STATUS_FAILED;
}
}
rbd_aio_release(comps[i]);
bdev_rbd_io_complete(bdev_io, bio_status);
}
}
/*
 * Release what a disk holds for its channel: the group channel reference,
 * the rbd image (when one is open) and, in the epoll variant, the event fd.
 * On return disk->main_td and disk->group_ch are NULL.
 *
 * The asserts require a non-NULL disk, the caller running on disk->main_td,
 * no outstanding channels (ch_count == 0) and a non-NULL group_ch.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers, so the
 * pre-change and post-change lines appear together (for example the two
 * spdk_put_io_channel() calls below). Confirm which lines are live against
 * the real file before relying on it.
 */
static void
bdev_rbd_free_channel_resources(struct bdev_rbd *disk)
{
int rc;
assert(disk != NULL);
assert(disk->main_td == spdk_get_thread());
assert(disk->ch_count == 0);
assert(disk->group_ch != NULL);
/* epoll variant: detach the disk's event fd from the group's epoll set. */
rc = epoll_ctl(disk->group_ch->epoll_fd, EPOLL_CTL_DEL,
disk->pfd, NULL);
if (rc < 0) {
/* Failure is only logged; teardown continues. */
SPDK_ERRLOG("Failed to remove fd on disk=%p from the polling group=%p\n",
disk, disk->group_ch);
}
/* Variant where group_ch is a channel context pointer. */
spdk_put_io_channel(spdk_io_channel_from_ctx(disk->group_ch));
/* Variant where group_ch is the spdk_io_channel itself. */
spdk_put_io_channel(disk->group_ch);
if (disk->image) {
bdev_rbd_exit(disk->image);
}
if (disk->pfd >= 0) {
close(disk->pfd);
}
disk->main_td = NULL;
disk->group_ch = NULL;
}
@ -720,42 +685,15 @@ bdev_rbd_handle(void *arg)
/*
 * Acquire the channel-side resources for a disk: take a reference on the
 * "rbd_if" io_device channel and run bdev_rbd_handle() through
 * spdk_call_unaffinitized(). The epoll variant additionally creates an
 * eventfd, registers it with the image and adds it to the group's epoll set.
 *
 * Returns 0 on success. On failure the resources are released with
 * bdev_rbd_free_channel_resources() and -1 is returned.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers, so the
 * pre-change and post-change lines appear together (two assignments to
 * group_ch, and a cleanup/return pair directly after a "goto err"). Confirm
 * which lines are live against the real file.
 */
static int
_bdev_rbd_create_cb(struct bdev_rbd *disk)
{
int ret;
struct epoll_event event = {};
/* Variant storing the channel context pointer. */
disk->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&rbd_if));
/* Variant storing the spdk_io_channel itself. */
disk->group_ch = spdk_get_io_channel(&rbd_if);
assert(disk->group_ch != NULL);
event.events = EPOLLIN;
/* The disk pointer is what bdev_rbd_group_poll() reads back from the event. */
event.data.ptr = disk;
if (spdk_call_unaffinitized(bdev_rbd_handle, disk) == NULL) {
goto err;
}
disk->pfd = eventfd(0, EFD_NONBLOCK);
if (disk->pfd < 0) {
SPDK_ERRLOG("Failed to get eventfd\n");
goto err;
}
ret = rbd_set_image_notification(disk->image, disk->pfd, EVENT_TYPE_EVENTFD);
if (ret < 0) {
SPDK_ERRLOG("Failed to set rbd image notification\n");
goto err;
}
ret = epoll_ctl(disk->group_ch->epoll_fd, EPOLL_CTL_ADD, disk->pfd, &event);
if (ret < 0) {
SPDK_ERRLOG("Failed to add the fd of disk=%p to the epoll group from group_ch=%p\n", disk,
disk->group_ch);
goto err;
bdev_rbd_free_channel_resources(disk);
return -1;
}
return 0;
err:
bdev_rbd_free_channel_resources(disk);
return -1;
}
static int
@ -1212,7 +1150,6 @@ bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id,
return ret;
}
rbd->pfd = -1;
rbd->rbd_name = strdup(rbd_name);
if (!rbd->rbd_name) {
bdev_rbd_free(rbd);
@ -1339,60 +1276,22 @@ bdev_rbd_resize(struct spdk_bdev *bdev, const uint64_t new_size_in_mb)
return rc;
}
/*
 * Poller body for one polling group.
 *
 * Performs a non-blocking epoll_wait() on the group's epoll fd and runs
 * bdev_rbd_io_poll() on the disk stored in each returned event.
 *
 * Returns SPDK_POLLER_BUSY when at least one event was returned,
 * SPDK_POLLER_IDLE otherwise (including when epoll_wait() fails).
 */
static int
bdev_rbd_group_poll(void *arg)
{
	struct bdev_rbd_group_channel *grp = arg;
	struct epoll_event ready[MAX_EVENTS_PER_POLL];
	int nr_ready;
	int idx = 0;

	/* Timeout 0: never block in the poller. */
	nr_ready = epoll_wait(grp->epoll_fd, ready, MAX_EVENTS_PER_POLL, 0);
	if (nr_ready > 0) {
		while (idx < nr_ready) {
			bdev_rbd_io_poll((struct bdev_rbd *)ready[idx].data.ptr);
			idx++;
		}
		return SPDK_POLLER_BUSY;
	}

	return SPDK_POLLER_IDLE;
}
/*
 * io_device channel-create callback for the polling group.
 *
 * Creates the group's epoll fd and registers bdev_rbd_group_poll() as a
 * poller with period 0 on the new group context.
 *
 * Returns 0 on success, -1 when the epoll fd cannot be created.
 */
static int
bdev_rbd_group_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_rbd_group_channel *grp = ctx_buf;
	int efd = epoll_create1(0);

	/* Stored unconditionally, so a failed create leaves a negative fd behind. */
	grp->epoll_fd = efd;
	if (efd >= 0) {
		grp->poller = SPDK_POLLER_REGISTER(bdev_rbd_group_poll, grp, 0);
		return 0;
	}

	SPDK_ERRLOG("Could not create epoll fd on io device=%p\n", io_device);
	return -1;
}
/*
 * io_device channel-destroy callback for the polling group: closes the
 * group's epoll fd when it is valid, then unregisters the group poller.
 */
static void
bdev_rbd_group_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_rbd_group_channel *grp = ctx_buf;
	int efd = grp->epoll_fd;

	if (!(efd < 0)) {
		close(efd);
	}

	spdk_poller_unregister(&grp->poller);
}
/*
 * Module init hook: registers &rbd_if as an io_device named
 * "bdev_rbd_poll_groups" with the group create/destroy callbacks.
 * Always returns 0.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers; the two
 * argument tails below are the pre-change (context size of
 * struct bdev_rbd_group_channel) and post-change (context size 0) versions
 * of the same call. Confirm which one is live against the real file.
 */
static int
bdev_rbd_library_init(void)
{
spdk_io_device_register(&rbd_if, bdev_rbd_group_create_cb, bdev_rbd_group_destroy_cb,
sizeof(struct bdev_rbd_group_channel), "bdev_rbd_poll_groups");
0, "bdev_rbd_poll_groups");
return 0;
}