From e48813009c6adbc8b461d24074ab3e926d9ae3df Mon Sep 17 00:00:00 2001
From: Hans Petter Selasky
Date: Thu, 30 Jan 2020 12:35:13 +0000
Subject: [PATCH] Widen EPOCH(9) usage in mlx5en(4).

Make completion event path mostly lockless using EPOCH(9).

Implement a mechanism using EPOCH(9) which allows us to make the
callback path for completion events mostly lockless.

Simplify draining callback events using epoch_wait().

While at it make sure all receive completion callbacks are covered by
the network EPOCH(9), because this is required when calling if_input()
and ether_input() after r357012.

Sponsored by:	Mellanox Technologies
---
 sys/dev/mlx5/cq.h                   |   2 -
 sys/dev/mlx5/driver.h               |   8 +-
 sys/dev/mlx5/mlx5_core/mlx5_cq.c    | 174 +++++++++++++---------------
 sys/dev/mlx5/mlx5_en/mlx5_en_main.c |   9 ++
 4 files changed, 91 insertions(+), 102 deletions(-)

diff --git a/sys/dev/mlx5/cq.h b/sys/dev/mlx5/cq.h
index 161fbe13001a..9e22f8dbd497 100644
--- a/sys/dev/mlx5/cq.h
+++ b/sys/dev/mlx5/cq.h
@@ -38,8 +38,6 @@ struct mlx5_core_cq {
 	int cqe_sz;
 	__be32 *set_ci_db;
 	__be32 *arm_db;
-	atomic_t refcount;
-	struct completion free;
 	unsigned vector;
 	int irqn;
 	void (*comp) (struct mlx5_core_cq *);
diff --git a/sys/dev/mlx5/driver.h b/sys/dev/mlx5/driver.h
index 5ea37307b8b6..e2a68725751b 100644
--- a/sys/dev/mlx5/driver.h
+++ b/sys/dev/mlx5/driver.h
@@ -514,21 +514,17 @@ struct mlx5_core_health {
 	struct workqueue_struct *wq_cmd;
 };
 
-#ifdef RATELIMIT
-#define MLX5_CQ_LINEAR_ARRAY_SIZE (128 * 1024)
-#else
 #define MLX5_CQ_LINEAR_ARRAY_SIZE 1024
-#endif
 
 struct mlx5_cq_linear_array_entry {
-	spinlock_t lock;
 	struct mlx5_core_cq * volatile cq;
 };
 
 struct mlx5_cq_table {
 	/* protect radix tree */
-	spinlock_t lock;
+	spinlock_t writerlock;
+	atomic_t writercount;
 	struct radix_tree_root tree;
 	struct mlx5_cq_linear_array_entry linear_array[MLX5_CQ_LINEAR_ARRAY_SIZE];
 };
 
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_cq.c b/sys/dev/mlx5/mlx5_core/mlx5_cq.c
index cd2093924242..4cbc4eb61377 100644
--- a/sys/dev/mlx5/mlx5_core/mlx5_cq.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_cq.c
@@ -33,72 +33,91 @@
 #include 
 #include "mlx5_core.h"
+#include 
+
+static void
+mlx5_cq_table_write_lock(struct mlx5_cq_table *table)
+{
+
+	atomic_inc(&table->writercount);
+	/* make sure all see the updated writercount */
+	NET_EPOCH_WAIT();
+	spin_lock(&table->writerlock);
+}
+
+static void
+mlx5_cq_table_write_unlock(struct mlx5_cq_table *table)
+{
+
+	spin_unlock(&table->writerlock);
+	atomic_dec(&table->writercount);
+	/* drain all pending CQ callers */
+	NET_EPOCH_WAIT();
+}
+
 void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn)
 {
-	struct mlx5_core_cq *cq;
 	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	struct mlx5_core_cq *cq;
+	struct epoch_tracker et;
+	bool do_lock;
 
-	if (cqn < MLX5_CQ_LINEAR_ARRAY_SIZE) {
-		struct mlx5_cq_linear_array_entry *entry;
+	NET_EPOCH_ENTER(et);
 
-		entry = &table->linear_array[cqn];
-		spin_lock(&entry->lock);
-		cq = entry->cq;
-		if (cq == NULL) {
-			mlx5_core_warn(dev,
-			    "Completion event for bogus CQ 0x%x\n", cqn);
-		} else {
-			++cq->arm_sn;
-			cq->comp(cq);
-		}
-		spin_unlock(&entry->lock);
-		return;
+	do_lock = atomic_read(&table->writercount) != 0;
+	if (unlikely(do_lock))
+		spin_lock(&table->writerlock);
+
+	if (likely(cqn < MLX5_CQ_LINEAR_ARRAY_SIZE))
+		cq = table->linear_array[cqn].cq;
+	else
+		cq = radix_tree_lookup(&table->tree, cqn);
+
+	if (unlikely(do_lock))
+		spin_unlock(&table->writerlock);
+
+	if (likely(cq != NULL)) {
+		++cq->arm_sn;
+		cq->comp(cq);
+	} else {
+		mlx5_core_warn(dev,
+		    "Completion event for bogus CQ 0x%x\n", cqn);
 	}
-	spin_lock(&table->lock);
-	cq = radix_tree_lookup(&table->tree, cqn);
-	if (likely(cq))
-		atomic_inc(&cq->refcount);
-	spin_unlock(&table->lock);
-
-	if (!cq) {
-		mlx5_core_warn(dev, "Completion event for bogus CQ 0x%x\n", cqn);
-		return;
-	}
-
-	++cq->arm_sn;
-
-	cq->comp(cq);
-
-	if (atomic_dec_and_test(&cq->refcount))
-		complete(&cq->free);
+	NET_EPOCH_EXIT(et);
 }
 
 void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type)
 {
 	struct mlx5_cq_table *table = &dev->priv.cq_table;
 	struct mlx5_core_cq *cq;
+	struct epoch_tracker et;
+	bool do_lock;
 
-	spin_lock(&table->lock);
+	NET_EPOCH_ENTER(et);
 
-	cq = radix_tree_lookup(&table->tree, cqn);
-	if (cq)
-		atomic_inc(&cq->refcount);
+	do_lock = atomic_read(&table->writercount) != 0;
+	if (unlikely(do_lock))
+		spin_lock(&table->writerlock);
 
-	spin_unlock(&table->lock);
+	if (likely(cqn < MLX5_CQ_LINEAR_ARRAY_SIZE))
+		cq = table->linear_array[cqn].cq;
+	else
+		cq = radix_tree_lookup(&table->tree, cqn);
 
-	if (!cq) {
-		mlx5_core_warn(dev, "Async event for bogus CQ 0x%x\n", cqn);
-		return;
+	if (unlikely(do_lock))
+		spin_unlock(&table->writerlock);
+
+	if (likely(cq != NULL)) {
+		cq->event(cq, event_type);
+	} else {
+		mlx5_core_warn(dev,
+		    "Asynchronous event for bogus CQ 0x%x\n", cqn);
 	}
 
-	cq->event(cq, event_type);
-
-	if (atomic_dec_and_test(&cq->refcount))
-		complete(&cq->free);
+	NET_EPOCH_EXIT(et);
 }
 
-
 int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 			u32 *in, int inlen)
 {
@@ -116,24 +135,16 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 	cq->cqn = MLX5_GET(create_cq_out, out, cqn);
 	cq->cons_index = 0;
 	cq->arm_sn = 0;
-	atomic_set(&cq->refcount, 1);
-	init_completion(&cq->free);
 
-	spin_lock_irq(&table->lock);
+	mlx5_cq_table_write_lock(table);
 	err = radix_tree_insert(&table->tree, cq->cqn, cq);
-	spin_unlock_irq(&table->lock);
+	if (likely(err == 0 && cq->cqn < MLX5_CQ_LINEAR_ARRAY_SIZE))
+		table->linear_array[cq->cqn].cq = cq;
+	mlx5_cq_table_write_unlock(table);
+
 	if (err)
 		goto err_cmd;
 
-	if (cq->cqn < MLX5_CQ_LINEAR_ARRAY_SIZE) {
-		struct mlx5_cq_linear_array_entry *entry;
-
-		entry = &table->linear_array[cq->cqn];
-		spin_lock_irq(&entry->lock);
-		entry->cq = cq;
-		spin_unlock_irq(&entry->lock);
-	}
-
 	cq->pid = curthread->td_proc->p_pid;
 
 	return 0;
@@ -152,44 +163,24 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
 	u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {0};
 	u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {0};
 	struct mlx5_core_cq *tmp;
-	int err;
 
-	if (cq->cqn < MLX5_CQ_LINEAR_ARRAY_SIZE) {
-		struct mlx5_cq_linear_array_entry *entry;
-
-		entry = &table->linear_array[cq->cqn];
-		spin_lock_irq(&entry->lock);
-		entry->cq = NULL;
-		spin_unlock_irq(&entry->lock);
-	}
-
-	spin_lock_irq(&table->lock);
+	mlx5_cq_table_write_lock(table);
+	if (likely(cq->cqn < MLX5_CQ_LINEAR_ARRAY_SIZE))
+		table->linear_array[cq->cqn].cq = NULL;
 	tmp = radix_tree_delete(&table->tree, cq->cqn);
-	spin_unlock_irq(&table->lock);
-	if (!tmp) {
+	mlx5_cq_table_write_unlock(table);
+
+	if (unlikely(tmp == NULL)) {
 		mlx5_core_warn(dev, "cq 0x%x not found in tree\n", cq->cqn);
 		return -EINVAL;
-	}
-	if (tmp != cq) {
-		mlx5_core_warn(dev, "corruption on srqn 0x%x\n", cq->cqn);
+	} else if (unlikely(tmp != cq)) {
+		mlx5_core_warn(dev, "corrupted cqn 0x%x\n", cq->cqn);
 		return -EINVAL;
 	}
 
 	MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ);
 	MLX5_SET(destroy_cq_in, in, cqn, cq->cqn);
-	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
-	if (err)
-		goto out;
-
-	synchronize_irq(cq->irqn);
-
-	if (atomic_dec_and_test(&cq->refcount))
-		complete(&cq->free);
-	wait_for_completion(&cq->free);
-
-out:
-
-	return err;
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 EXPORT_SYMBOL(mlx5_core_destroy_cq);
 
@@ -259,17 +250,12 @@ int mlx5_core_modify_cq_moderation_mode(struct mlx5_core_dev *dev,
 int mlx5_init_cq_table(struct mlx5_core_dev *dev)
 {
 	struct mlx5_cq_table *table = &dev->priv.cq_table;
-	int err;
-	int x;
 
 	memset(table, 0, sizeof(*table));
-	spin_lock_init(&table->lock);
-	for (x = 0; x != MLX5_CQ_LINEAR_ARRAY_SIZE; x++)
-		spin_lock_init(&table->linear_array[x].lock);
+	spin_lock_init(&table->writerlock);
 	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
-	err = 0;
 
-	return err;
+	return 0;
 }
 
 void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev)
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index 2c8b46af12cd..504f6c01591e 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -2182,6 +2182,7 @@ mlx5e_open_channel(struct mlx5e_priv *priv,
 	struct mlx5e_channel_param *cparam,
 	struct mlx5e_channel *c)
 {
+	struct epoch_tracker et;
 	int i, err;
 
 	/* zero non-persistant data */
@@ -2209,7 +2210,9 @@ mlx5e_open_channel(struct mlx5e_priv *priv,
 		goto err_close_sqs;
 
 	/* poll receive queue initially */
+	NET_EPOCH_ENTER(et);
 	c->rq.cq.mcq.comp(&c->rq.cq.mcq);
+	NET_EPOCH_EXIT(et);
 
 	return (0);
 
@@ -3746,6 +3749,7 @@ static void
 mlx5e_disable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
+	struct epoch_tracker et;
 	int err;
 
 	mtx_lock(&rq->mtx);
@@ -3761,7 +3765,9 @@ mlx5e_disable_rx_dma(struct mlx5e_channel *ch)
 
 	while (!mlx5_wq_ll_is_empty(&rq->wq)) {
 		msleep(1);
+		NET_EPOCH_ENTER(et);
 		rq->cq.mcq.comp(&rq->cq.mcq);
+		NET_EPOCH_EXIT(et);
 	}
 
 	/*
@@ -3779,6 +3785,7 @@ static void
 mlx5e_enable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
+	struct epoch_tracker et;
 	int err;
 
 	rq->wq.wqe_ctr = 0;
@@ -3791,7 +3798,9 @@ mlx5e_enable_rx_dma(struct mlx5e_channel *ch)
 
 	rq->enabled = 1;
 
+	NET_EPOCH_ENTER(et);
 	rq->cq.mcq.comp(&rq->cq.mcq);
+	NET_EPOCH_EXIT(et);
 }
 
 void
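
Note (not part of the patch): the heart of the change is the read-mostly CQ table scheme above. Completion and async-event callbacks look up the CQ without taking any lock unless a writer is active, and writers use NET_EPOCH_WAIT() twice: once to publish their presence to lock-free readers, and once to drain readers that may still reference the removed entry. The sketch below restates that scheme with the native FreeBSD epoch(9), mutex(9) and atomic(9) KPIs instead of the linuxkpi wrappers used by mlx5. All names (demo_table, demo_obj, demo_dispatch, demo_update) are hypothetical and the code is illustrative only, not driver code.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/epoch.h>		/* NET_EPOCH_ENTER/EXIT/WAIT */
#include <machine/atomic.h>

#define	DEMO_SLOTS	1024

struct demo_obj {
	void	(*handler)(struct demo_obj *);
};

struct demo_table {
	struct mtx	writerlock;	/* serializes writers only */
	volatile u_int	writercount;	/* non-zero while an update is pending */
	struct demo_obj * volatile slot[DEMO_SLOTS]; /* read-mostly array */
};

static void
demo_init(struct demo_table *t)
{
	memset(t, 0, sizeof(*t));
	mtx_init(&t->writerlock, "demo_writerlock", NULL, MTX_DEF);
}

/*
 * Reader, e.g. a completion handler: normally lock-free, it falls back
 * to the writer lock only while an update is in progress.
 */
static void
demo_dispatch(struct demo_table *t, unsigned int idx)
{
	struct epoch_tracker et;
	struct demo_obj *obj;
	bool do_lock;

	NET_EPOCH_ENTER(et);
	do_lock = (atomic_load_acq_int(&t->writercount) != 0);
	if (__predict_false(do_lock))
		mtx_lock(&t->writerlock);
	obj = t->slot[idx];
	if (__predict_false(do_lock))
		mtx_unlock(&t->writerlock);
	/*
	 * The object may be used here; the second NET_EPOCH_WAIT() in
	 * demo_update() keeps it valid until we leave the epoch.
	 */
	if (__predict_true(obj != NULL))
		obj->handler(obj);
	NET_EPOCH_EXIT(et);
}

/*
 * Writer: announce itself, wait for lock-free readers to notice, update
 * under the lock, then wait again so no reader still uses the old pointer.
 */
static void
demo_update(struct demo_table *t, unsigned int idx, struct demo_obj *obj)
{
	atomic_add_int(&t->writercount, 1);
	/* Readers entering from now on will take the writer lock. */
	NET_EPOCH_WAIT();
	mtx_lock(&t->writerlock);
	t->slot[idx] = obj;
	mtx_unlock(&t->writerlock);
	atomic_subtract_int(&t->writercount, 1);
	/* Drain readers that may still reference the previous pointer. */
	NET_EPOCH_WAIT();
}

The second NET_EPOCH_WAIT() in demo_update() plays the same role as the one in mlx5_cq_table_write_unlock(): once it returns, no completion callback can still be running on a removed entry, which is what lets the patch delete the per-CQ refcount and completion used by the old mlx5_core_destroy_cq() path.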