xprtrdma: Spread reply processing over more CPUs
author: Chuck Lever <chuck.lever@oracle.com>
Mon, 4 Dec 2017 19:04:04 +0000 (14:04 -0500)
committer: Anna Schumaker <Anna.Schumaker@Netapp.com>
Fri, 15 Dec 2017 19:31:50 +0000 (14:31 -0500)
Commit d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler
directly from RECV completion") introduced a performance regression
for NFS I/O small enough to not need memory registration. In multi-
threaded benchmarks that generate primarily small I/O requests,
IOPS throughput is reduced by nearly a third. This patch restores
the previous level of throughput.

Because workqueues are typically BOUND (in particular ib_comp_wq,
nfsiod_workqueue, and rpciod_workqueue), NFS/RDMA workloads tend
to aggregate on the CPU that is handling Receive completions.

The usual approach to addressing this problem is to create a QP
and CQ for each CPU, and then schedule transactions on the QP
for the CPU where you want the transaction to complete. The
transaction then does not require an extra context switch during
completion to end up on the same CPU where the transaction was
started.

This approach doesn't work for the Linux NFS/RDMA client because
currently the Linux NFS client does not support multiple connections
per client-server pair, and the RDMA core API does not make it
straightforward for ULPs to determine which CPU is responsible for
handling Receive completions for a CQ.

So for the moment, record the CPU number in the rpcrdma_req before
the transport sends each RPC Call. Then during Receive completion,
queue the RPC completion on that same CPU.

Additionally, move all RPC completion processing to the deferred
handler so that even RPCs with simple small replies complete on
the CPU that sent the corresponding RPC Call.

Fixes: d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler ...")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h

index ed34dc0f144cce537fce51dcba5bb12fe0b6df1c..a3f2ab283aeba38b26514dd9eb0e948c71a9ee7e 100644 (file)
@@ -1408,11 +1408,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
        dprintk("RPC:       %s: reply %p completes request %p (xid 0x%08x)\n",
                __func__, rep, req, be32_to_cpu(rep->rr_xid));
 
-       if (list_empty(&req->rl_registered) &&
-           !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags))
-               rpcrdma_complete_rqst(rep);
-       else
-               queue_work(rpcrdma_receive_wq, &rep->rr_work);
+       queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work);
        return;
 
 out_badstatus:
index 646c24494ea7eba7fb2a2296ba6339e8dbf8f31e..6ee1ad8978f3b2977de2798d1a76ded1af6f78c4 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/sunrpc/addr.h>
+#include <linux/smp.h>
 
 #include "xprt_rdma.h"
 
@@ -656,6 +657,7 @@ xprt_rdma_allocate(struct rpc_task *task)
                task->tk_pid, __func__, rqst->rq_callsize,
                rqst->rq_rcvsize, req);
 
+       req->rl_cpu = smp_processor_id();
        req->rl_connect_cookie = 0;     /* our reserved value */
        rpcrdma_set_xprtdata(rqst, req);
        rqst->rq_buffer = req->rl_sendbuf->rg_base;
index 710b3f77db82869cd23abb90ea308ca67beef2bf..8607c029c0dd820250f4547c68bda41b7daca313 100644 (file)
@@ -83,7 +83,7 @@ rpcrdma_alloc_wq(void)
        struct workqueue_struct *recv_wq;
 
        recv_wq = alloc_workqueue("xprtrdma_receive",
-                                 WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI,
+                                 WQ_MEM_RECLAIM | WQ_HIGHPRI,
                                  0);
        if (!recv_wq)
                return -ENOMEM;
index 51686d9eac5f992d9d23d674f94df0e77f58bb72..1342f743f1c41acae0145a49962825aa1574311c 100644 (file)
@@ -342,6 +342,7 @@ enum {
 struct rpcrdma_buffer;
 struct rpcrdma_req {
        struct list_head        rl_list;
+       int                     rl_cpu;
        unsigned int            rl_connect_cookie;
        struct rpcrdma_buffer   *rl_buffer;
        struct rpcrdma_rep      *rl_reply;