[RFC,7/8] SUNRPC: Convert RQ_BUSY into a per-pool bitmap

Message ID: 168806418985.1034990.14686512686720974159.stgit@morisot.1015granger.net
Series: RPC service thread scheduler optimizations

Commit Message

Chuck Lever June 29, 2023, 6:43 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

I've noticed that server request latency goes up simply when the
nfsd thread count is increased.

List walking is known to be memory-inefficient. On a busy server
with many threads, enqueuing a transport will walk the "all threads"
list quite frequently. This also pulls in the cache lines for some
hot fields in each svc_rqst.

The svc_xprt_enqueue() call that concerns me most is the one in
svc_rdma_wc_receive(), which is single-threaded per CQ. Slowing
down completion handling will limit the total throughput per
RDMA connection.

So, avoid walking the "all threads" list to find an idle thread to
wake. Instead, set up an idle bitmap and search it with
find_next_bit(), which should work the same way as RQ_BUSY but
touches only the cache lines that the bitmap occupies. I think we
can stick with atomic bit operations here to avoid taking the pool
lock.
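
For illustration only, here is a minimal userspace sketch (mine,
not part of the patch) of the wake-up side of this scheme. The
kernel's find_next_bit() and test_and_clear_bit() are approximated
with portable stand-ins; the claim loop itself mirrors the one that
svc_pool_wake_idle_thread() gains below.

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))

/* Simplified find_next_bit(): index of the first set bit at or
 * after @start, or @size if no further bits are set. */
static unsigned long find_next_bit(const unsigned long *map,
				   unsigned long size, unsigned long start)
{
	unsigned long bit;

	for (bit = start; bit < size; bit++)
		if (map[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)))
			return bit;
	return size;
}

/* Simplified atomic test_and_clear_bit(). */
static int test_and_clear_bit(unsigned long bit, unsigned long *map)
{
	unsigned long mask = 1UL << (bit % BITS_PER_LONG);
	unsigned long old;

	old = __atomic_fetch_and(&map[bit / BITS_PER_LONG], ~mask,
				 __ATOMIC_SEQ_CST);
	return (old & mask) != 0;
}

int main(void)
{
	unsigned long idle_map[4] = { 0 };	/* 256 bits, all "busy" */
	unsigned long nrthreads = 256;
	unsigned long bit;

	idle_map[1] |= 1UL << 3;	/* pretend thread 67 went idle */

	/* The claim loop: find a candidate, then race to clear its
	 * bit atomically. If another CPU claimed that thread first,
	 * keep searching from the same position. */
	bit = 0;
	do {
		bit = find_next_bit(idle_map, nrthreads, bit);
		if (bit == nrthreads) {
			puts("pool starved: no idle thread");
			return 0;
		}
	} while (!test_and_clear_bit(bit, idle_map));

	printf("claimed idle thread %lu\n", bit);	/* prints 67 */
	return 0;
}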

The server can keep track of up to 64 threads in just one unsigned
long, and the bitmap can be multiple words long to handle even more
threads.
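
As a back-of-the-envelope check (mine, not from the patch): with
the RPCSVC_MAXPOOLTHREADS cap of 256 introduced below, the whole
idle map is BITS_TO_LONGS(256) == 4 unsigned longs on a 64-bit
host, or 32 bytes -- half of one typical 64-byte cache line:

#include <stdio.h>

/* Standalone footprint check; BITS_TO_LONGS() mirrors the kernel
 * macro of the same name. */
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	/* For the patch's RPCSVC_MAXPOOLTHREADS of 256: */
	printf("%zu longs, %zu bytes\n", BITS_TO_LONGS(256),
	       BITS_TO_LONGS(256) * sizeof(unsigned long));
	return 0;	/* prints "4 longs, 32 bytes" on LP64 */
}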

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h    |    6 ++++--
 include/trace/events/sunrpc.h |    1 -
 net/sunrpc/svc.c              |   38 ++++++++++++++++++++++++++------------
 net/sunrpc/svc_xprt.c         |   23 +++++++++++++++++++----
 4 files changed, 49 insertions(+), 19 deletions(-)

Patch

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 45aa7648dca6..ffa58a7a689d 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -35,6 +35,7 @@  struct svc_pool {
 	spinlock_t		sp_lock;	/* protects sp_sockets */
 	struct list_head	sp_sockets;	/* pending sockets */
 	unsigned int		sp_nrthreads;	/* # of threads in pool */
+	unsigned long		*sp_idle_map;	/* idle threads */
 	struct xarray		sp_thread_xa;
 
 	/* statistics on pool operation */
@@ -189,6 +190,8 @@  extern u32 svc_max_payload(const struct svc_rqst *rqstp);
 #define RPCSVC_MAXPAGES		((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE \
 				+ 2 + 1)
 
+#define RPCSVC_MAXPOOLTHREADS	(256)
+
 /*
  * The context of a single thread, including the request currently being
  * processed.
@@ -238,8 +241,7 @@  struct svc_rqst {
 #define	RQ_SPLICE_OK	(4)			/* turned off in gss privacy
 						 * to prevent encrypting page
 						 * cache pages */
-#define	RQ_BUSY		(5)			/* request is busy */
-#define	RQ_DATA		(6)			/* request has data */
+#define	RQ_DATA		(5)			/* request has data */
 	unsigned long		rq_flags;	/* flags field */
 	u32			rq_thread_id;	/* xarray index */
 	ktime_t			rq_qtime;	/* enqueue time */
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 4ec746048f15..f64c255975ab 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1600,7 +1600,6 @@  DEFINE_SVCXDRBUF_EVENT(sendto);
 	svc_rqst_flag(USEDEFERRAL)					\
 	svc_rqst_flag(DROPME)						\
 	svc_rqst_flag(SPLICE_OK)					\
-	svc_rqst_flag(BUSY)						\
 	svc_rqst_flag_end(DATA)
 
 #undef svc_rqst_flag
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 18fbb98895ea..c2cba61a890c 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -509,6 +509,12 @@  __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		INIT_LIST_HEAD(&pool->sp_sockets);
 		spin_lock_init(&pool->sp_lock);
 		xa_init_flags(&pool->sp_thread_xa, XA_FLAGS_ALLOC);
+		/* All threads initially marked "busy" */
+		pool->sp_idle_map =
+			bitmap_zalloc_node(RPCSVC_MAXPOOLTHREADS, GFP_KERNEL,
+					   svc_pool_map_get_node(i));
+		if (!pool->sp_idle_map)
+			return NULL;
 
 		percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL);
 		percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL);
@@ -594,6 +600,8 @@  svc_destroy(struct kref *ref)
 		percpu_counter_destroy(&pool->sp_threads_starved);
 
 		xa_destroy(&pool->sp_thread_xa);
+		bitmap_free(pool->sp_idle_map);
+		pool->sp_idle_map = NULL;
 	}
 	kfree(serv->sv_pools);
 	kfree(serv);
@@ -645,7 +653,6 @@  svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
 
 	folio_batch_init(&rqstp->rq_fbatch);
 
-	__set_bit(RQ_BUSY, &rqstp->rq_flags);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
 
@@ -675,7 +682,7 @@  static struct svc_rqst *
 svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 {
 	static const struct xa_limit limit = {
-		.max = UINT_MAX,
+		.max = RPCSVC_MAXPOOLTHREADS,
 	};
 	struct svc_rqst	*rqstp;
 	int ret;
@@ -720,18 +727,24 @@  struct svc_rqst *svc_pool_wake_idle_thread(struct svc_serv *serv,
 					   struct svc_pool *pool)
 {
 	struct svc_rqst	*rqstp;
-	unsigned long index;
+	unsigned long bit;
 
-	xa_for_each(&pool->sp_thread_xa, index, rqstp) {
-		if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags))
-			continue;
+	bit = 0;
+	do {
+		bit = find_next_bit(pool->sp_idle_map, pool->sp_nrthreads, bit);
+		if (bit == pool->sp_nrthreads)
+			goto out_starved;
+	} while (!test_and_clear_bit(bit, pool->sp_idle_map));
 
-		WRITE_ONCE(rqstp->rq_qtime, ktime_get());
-		wake_up_process(rqstp->rq_task);
-		percpu_counter_inc(&pool->sp_threads_woken);
-		return rqstp;
-	}
+	rqstp = xa_find(&pool->sp_thread_xa, &bit, bit, XA_PRESENT);
+	if (!rqstp)
+		goto out_starved;
+	WRITE_ONCE(rqstp->rq_qtime, ktime_get());
+	wake_up_process(rqstp->rq_task);
+	percpu_counter_inc(&pool->sp_threads_woken);
+	return rqstp;
 
+out_starved:
 	trace_svc_pool_starved(serv, pool);
 	percpu_counter_inc(&pool->sp_threads_starved);
 	return NULL;
@@ -765,7 +778,8 @@  svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *stat
 	}
 
 found_pool:
-	rqstp = xa_find(&pool->sp_thread_xa, &zero, U32_MAX, XA_PRESENT);
+	rqstp = xa_find(&pool->sp_thread_xa, &zero, RPCSVC_MAXPOOLTHREADS,
+			XA_PRESENT);
 	if (rqstp) {
 		__xa_erase(&pool->sp_thread_xa, rqstp->rq_thread_id);
 		task = rqstp->rq_task;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 77fc20b2181d..e22f1432aabb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -734,6 +734,18 @@  rqst_should_sleep(struct svc_rqst *rqstp)
 	return true;
 }
 
+static void svc_rqst_mark_idle(struct svc_rqst *rqstp)
+{
+	set_bit(rqstp->rq_thread_id, rqstp->rq_pool->sp_idle_map);
+	smp_mb__after_atomic();
+}
+
+static void svc_rqst_mark_busy(struct svc_rqst *rqstp)
+{
+	clear_bit(rqstp->rq_thread_id, rqstp->rq_pool->sp_idle_map);
+	smp_mb__after_atomic();
+}
+
 static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 {
 	struct svc_pool		*pool = rqstp->rq_pool;
@@ -755,8 +767,7 @@  static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 	set_current_state(TASK_INTERRUPTIBLE);
 	smp_mb__before_atomic();
 	clear_bit(SP_CONGESTED, &pool->sp_flags);
-	clear_bit(RQ_BUSY, &rqstp->rq_flags);
-	smp_mb__after_atomic();
+	svc_rqst_mark_idle(rqstp);
 
 	if (likely(rqst_should_sleep(rqstp)))
 		time_left = schedule_timeout(timeout);
@@ -765,8 +776,12 @@  static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 
 	try_to_freeze();
 
-	set_bit(RQ_BUSY, &rqstp->rq_flags);
-	smp_mb__after_atomic();
+	/* Post-sleep: look for more work.
+	 *
+	 * Note: If we were awoken, then this rqstp has already
+	 * been marked busy.
+	 */
+	svc_rqst_mark_busy(rqstp);
 	rqstp->rq_xprt = svc_xprt_dequeue(pool);
 	if (rqstp->rq_xprt) {
 		trace_svc_pool_awoken(pool, rqstp);