From f472a0a088b63a3b41b843df8d59f2d1394fb6bf Mon Sep 17 00:00:00 2001 From: jimharris Date: Fri, 30 Oct 2015 16:06:34 +0000 Subject: [PATCH] nvme: fix race condition in split bio completion path Fixes race condition observed under following circumstances: 1) I/O split on 128KB boundary with Intel NVMe controller. Current Intel controllers produce better latency when I/Os do not span a 128KB boundary - even if the I/O size itself is less than 128KB. 2) Per-CPU I/O queues are enabled. 3) Child I/Os are submitted on different submission queues. 4) Interrupts for child I/O completions occur almost simultaneously. 5) ithread for child I/O A increments bio_inbed, then immediately is preempted (rendezvous IPI, higher priority interrupt). 6) ithread for child I/O B increments bio_inbed, then completes parent bio since all children are now completed. 7) parent bio is freed, and immediately reallocated for a VFS or gpart bio (including setting bio_children to 1 and clearing bio_driver1). 8) ithread for child I/O A resumes processing. bio_children for what it thinks is the parent bio is set to 1, so it thinks it needs to complete the parent bio. Result is either calling a NULL callback function, or double freeing the bio to its uma zone. PR: 203746 Reported by: Drew Gallatin , Marc Goroff Tested by: Drew Gallatin MFC after: 3 days Sponsored by: Intel --- sys/dev/nvme/nvme_ns.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c index 84a19805e495..c6a00df6b884 100644 --- a/sys/dev/nvme/nvme_ns.c +++ b/sys/dev/nvme/nvme_ns.c @@ -239,7 +239,7 @@ static void nvme_bio_child_inbed(struct bio *parent, int bio_error) { struct nvme_completion parent_cpl; - int inbed; + int children, inbed; if (bio_error != 0) { parent->bio_flags |= BIO_ERROR; @@ -248,10 +248,13 @@ nvme_bio_child_inbed(struct bio *parent, int bio_error) /* * atomic_fetchadd will return value before adding 1, so we still - * must add 1 to get the updated inbed number. + * must add 1 to get the updated inbed number. Save bio_children + * before incrementing to guard against race conditions when + * two children bios complete on different queues. */ + children = atomic_load_acq_int(&parent->bio_children); inbed = atomic_fetchadd_int(&parent->bio_inbed, 1) + 1; - if (inbed == parent->bio_children) { + if (inbed == children) { bzero(&parent_cpl, sizeof(parent_cpl)); if (parent->bio_flags & BIO_ERROR) parent_cpl.status.sc = NVME_SC_DATA_TRANSFER_ERROR;