bpf: cpumap use ptr_ring_consume_batched
author Jesper Dangaard Brouer <brouer@redhat.com>
Fri, 12 Apr 2019 15:07:32 +0000 (17:07 +0200)
committer Alexei Starovoitov <ast@kernel.org>
Thu, 18 Apr 2019 02:09:24 +0000 (19:09 -0700)
Move the ptr_ring dequeue outside the loop that allocates SKBs and calls
the network stack, as these operations can take some time. The ptr_ring is
a communication channel between CPUs, where we want to reduce/limit any
cacheline bouncing.

Do a concentrated bulk dequeue via ptr_ring_consume_batched, to shorten the
period during which, and reduce the number of times, the remote cacheline
in ptr_ring is read.

Batch size 8 is chosen both to (1) limit the BH-disable period, and (2)
consume one cacheline on 64-bit archs. After reducing the BH-disable section
further, we can consider changing this, while still taking the L1 cacheline
size into account.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
kernel/bpf/cpumap.c

index 3c18260403dde1df951448c600b6ef9ac61f5635..430103e182a04d780754d209f89ddc79ae717476 100644 (file)
@@ -240,6 +240,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
        }
 }
 
+#define CPUMAP_BATCH 8
+
 static int cpu_map_kthread_run(void *data)
 {
        struct bpf_cpu_map_entry *rcpu = data;
@@ -252,8 +254,9 @@ static int cpu_map_kthread_run(void *data)
         * kthread_stop signal until queue is empty.
         */
        while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
-               unsigned int processed = 0, drops = 0, sched = 0;
-               struct xdp_frame *xdpf;
+               unsigned int drops = 0, sched = 0;
+               void *frames[CPUMAP_BATCH];
+               int i, n;
 
                /* Release CPU reschedule checks */
                if (__ptr_ring_empty(rcpu->queue)) {
@@ -269,14 +272,16 @@ static int cpu_map_kthread_run(void *data)
                        sched = cond_resched();
                }
 
-               /* Process packets in rcpu->queue */
-               local_bh_disable();
                /*
                 * The bpf_cpu_map_entry is single consumer, with this
                 * kthread CPU pinned. Lockless access to ptr_ring
                 * consume side valid as no-resize allowed of queue.
                 */
-               while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
+               n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
+
+               local_bh_disable();
+               for (i = 0; i < n; i++) {
+                       struct xdp_frame *xdpf = frames[i];
                        struct sk_buff *skb;
                        int ret;
 
@@ -290,13 +295,9 @@ static int cpu_map_kthread_run(void *data)
                        ret = netif_receive_skb_core(skb);
                        if (ret == NET_RX_DROP)
                                drops++;
-
-                       /* Limit BH-disable period */
-                       if (++processed == 8)
-                               break;
                }
                /* Feedback loop via tracepoint */
-               trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+               trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
 
                local_bh_enable(); /* resched point, may call do_softirq() */
        }