i40e: add AF_XDP zero-copy Rx support
author Björn Töpel <bjorn.topel@intel.com>
Tue, 28 Aug 2018 12:44:32 +0000 (14:44 +0200)
committer Alexei Starovoitov <ast@kernel.org>
Wed, 29 Aug 2018 19:25:53 +0000 (12:25 -0700)
This patch adds zero-copy Rx support for AF_XDP sockets. Instead of
allocating buffers of type MEM_TYPE_PAGE_SHARED, the driver allocates
Rx frames as MEM_TYPE_ZERO_COPY when AF_XDP is enabled for a certain
queue.
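
For reference, user space requests zero-copy by binding an AF_XDP
socket to a specific queue with the XDP_ZEROCOPY flag, roughly as
done by samples/bpf/xdpsock_user.c. Below is a minimal sketch only:
ring mmap and error handling are omitted, and the interface name,
queue id, and ring sizes are placeholders.

  #include <linux/if_xdp.h>
  #include <net/if.h>
  #include <stdlib.h>
  #include <sys/socket.h>
  #include <unistd.h>

  #ifndef AF_XDP
  #define AF_XDP 44
  #endif
  #ifndef SOL_XDP
  #define SOL_XDP 283
  #endif

  #define NUM_FRAMES 4096
  #define FRAME_SIZE 2048

  int main(void)
  {
          struct xdp_umem_reg mr = {};
          struct sockaddr_xdp sxdp = {};
          int ring_size = 1024;
          void *bufs;
          int fd;

          fd = socket(AF_XDP, SOCK_RAW, 0);

          /* Register the UMEM that will back the zero-copy Rx frames. */
          posix_memalign(&bufs, getpagesize(), NUM_FRAMES * FRAME_SIZE);
          mr.addr = (__u64)(unsigned long)bufs;
          mr.len = NUM_FRAMES * FRAME_SIZE;
          mr.chunk_size = FRAME_SIZE;
          mr.headroom = 0;
          setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

          /* Create the fill, completion and Rx rings (mmap not shown). */
          setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_size, sizeof(ring_size));
          setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_size, sizeof(ring_size));
          setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_size, sizeof(ring_size));

          /* Bind to queue 0 of eth0 and ask for zero-copy mode. */
          sxdp.sxdp_family = AF_XDP;
          sxdp.sxdp_ifindex = if_nametoindex("eth0");
          sxdp.sxdp_queue_id = 0;
          sxdp.sxdp_flags = XDP_ZEROCOPY;
          bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));

          /* ... populate the fill ring, then consume the Rx ring ... */
          return 0;
  }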

All AF_XDP specific functions are added to a new file, i40e_xsk.c.

Note that when AF_XDP zero-copy is enabled, the XDP action XDP_PASS
will allocate a new buffer and copy the zero-copy frame prior to
passing it to the kernel stack.
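
The attached XDP program still decides the fate of each zero-copy
frame: XDP_REDIRECT into an XSKMAP delivers it to the socket, while
XDP_PASS takes the copy path described above. A minimal program in
the style of samples/bpf/xdpsock_kern.c is sketched below; the
section name, queue number, and map size are placeholders.

  /* SPDX-License-Identifier: GPL-2.0 */
  #include <linux/bpf.h>
  #include "bpf_helpers.h" /* SEC(), bpf_redirect_map(), as in samples/bpf */

  #define XSK_QUEUE 0 /* queue the AF_XDP socket is bound to */

  struct bpf_map_def SEC("maps") xsks_map = {
          .type        = BPF_MAP_TYPE_XSKMAP,
          .key_size    = sizeof(int),
          .value_size  = sizeof(int),
          .max_entries = 4,
  };

  SEC("xdp_sock")
  int xdp_sock_prog(struct xdp_md *ctx)
  {
          int index = ctx->rx_queue_index;

          /* Zero-copy delivery to the socket bound to XSK_QUEUE. */
          if (index == XSK_QUEUE)
                  return bpf_redirect_map(&xsks_map, index, 0);

          /* Everything else goes to the stack, which in zero-copy
           * mode means a copy into a freshly allocated buffer.
           */
          return XDP_PASS;
  }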

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
drivers/net/ethernet/intel/i40e/Makefile
drivers/net/ethernet/intel/i40e/i40e.h
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/i40e/i40e_txrx.h
drivers/net/ethernet/intel/i40e/i40e_xsk.c [new file with mode: 0644]
drivers/net/ethernet/intel/i40e/i40e_xsk.h [new file with mode: 0644]

index 14397e7e9925e352a5454ef5207cb4f032d067e3..50590e8d1fd1389926a549a7ebd7c47a0ac9e28e 100644 (file)
@@ -22,6 +22,7 @@ i40e-objs := i40e_main.o \
        i40e_txrx.o     \
        i40e_ptp.o      \
        i40e_client.o   \
-       i40e_virtchnl_pf.o
+       i40e_virtchnl_pf.o \
+       i40e_xsk.o
 
 i40e-$(CONFIG_I40E_DCB) += i40e_dcb.o i40e_dcb_nl.o
index 7a80652e25008503ae17fbafa158337266c00f90..876cac317e795ae33bcc2cf35b60304e04fa4544 100644 (file)
@@ -786,6 +786,11 @@ struct i40e_vsi {
 
        /* VSI specific handlers */
        irqreturn_t (*irq_handler)(int irq, void *data);
+
+       /* AF_XDP zero-copy */
+       struct xdp_umem **xsk_umems;
+       u16 num_xsk_umems_used;
+       u16 num_xsk_umems;
 } ____cacheline_internodealigned_in_smp;
 
 struct i40e_netdev_priv {
@@ -1090,6 +1095,20 @@ static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi *vsi)
        return !!vsi->xdp_prog;
 }
 
+static inline struct xdp_umem *i40e_xsk_umem(struct i40e_ring *ring)
+{
+       bool xdp_on = i40e_enabled_xdp_vsi(ring->vsi);
+       int qid = ring->queue_index;
+
+       if (ring_is_xdp(ring))
+               qid -= ring->vsi->alloc_queue_pairs;
+
+       if (!ring->vsi->xsk_umems || !ring->vsi->xsk_umems[qid] || !xdp_on)
+               return NULL;
+
+       return ring->vsi->xsk_umems[qid];
+}
+
 int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch);
 int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate);
 int i40e_add_del_cloud_filter(struct i40e_vsi *vsi,
index d8b5a6af72bdd554414cf469ec3b1e28a32c03f3..848eea7c84dbd8332320adab2961021cf54fa064 100644 (file)
@@ -9,7 +9,9 @@
 /* Local includes */
 #include "i40e.h"
 #include "i40e_diag.h"
+#include "i40e_xsk.h"
 #include <net/udp_tunnel.h>
+#include <net/xdp_sock.h>
 /* All i40e tracepoints are defined by the include below, which
  * must be included exactly once across the whole kernel with
  * CREATE_TRACE_POINTS defined
@@ -3181,13 +3183,46 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        struct i40e_hw *hw = &vsi->back->hw;
        struct i40e_hmc_obj_rxq rx_ctx;
        i40e_status err = 0;
+       bool ok;
+       int ret;
 
        bitmap_zero(ring->state, __I40E_RING_STATE_NBITS);
 
        /* clear the context structure first */
        memset(&rx_ctx, 0, sizeof(rx_ctx));
 
-       ring->rx_buf_len = vsi->rx_buf_len;
+       if (ring->vsi->type == I40E_VSI_MAIN)
+               xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
+
+       ring->xsk_umem = i40e_xsk_umem(ring);
+       if (ring->xsk_umem) {
+               ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
+                                  XDP_PACKET_HEADROOM;
+               /* For AF_XDP ZC, we disallow packets spanning
+                * multiple buffers, which lets us skip that
+                * handling in the fast path.
+                */
+               chain_len = 1;
+               ring->zca.free = i40e_zca_free;
+               ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+                                                MEM_TYPE_ZERO_COPY,
+                                                &ring->zca);
+               if (ret)
+                       return ret;
+               dev_info(&vsi->back->pdev->dev,
+                        "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
+                        ring->queue_index);
+
+       } else {
+               ring->rx_buf_len = vsi->rx_buf_len;
+               if (ring->vsi->type == I40E_VSI_MAIN) {
+                       ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+                                                        MEM_TYPE_PAGE_SHARED,
+                                                        NULL);
+                       if (ret)
+                               return ret;
+               }
+       }
 
        rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
                                    BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT));
@@ -3243,7 +3278,15 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
        writel(0, ring->tail);
 
-       i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
+       ok = ring->xsk_umem ?
+            i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) :
+            !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
+       if (!ok) {
+               dev_info(&vsi->back->pdev->dev,
+                        "Failed to allocate some buffers on %sRx ring %d (pf_q %d)\n",
+                        ring->xsk_umem ? "UMEM enabled " : "",
+                        ring->queue_index, pf_q);
+       }
 
        return 0;
 }
@@ -12097,6 +12140,12 @@ static int i40e_xdp(struct net_device *dev,
        case XDP_QUERY_PROG:
                xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0;
                return 0;
+       case XDP_QUERY_XSK_UMEM:
+               return i40e_xsk_umem_query(vsi, &xdp->xsk.umem,
+                                          xdp->xsk.queue_id);
+       case XDP_SETUP_XSK_UMEM:
+               return i40e_xsk_umem_setup(vsi, xdp->xsk.umem,
+                                          xdp->xsk.queue_id);
        default:
                return -EINVAL;
        }
index 878fb4b47484b86b55fe0e603ccd8455327cfa31..2c4d179ffebfd4c0625958479bebad09723931c4 100644 (file)
@@ -9,6 +9,7 @@
 #include "i40e_trace.h"
 #include "i40e_prototype.h"
 #include "i40e_txrx_common.h"
+#include "i40e_xsk.h"
 
 static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
                                u32 td_tag)
@@ -1380,6 +1381,9 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
                rx_ring->skb = NULL;
        }
 
+       if (rx_ring->xsk_umem)
+               goto skip_free;
+
        /* Free all the Rx ring sk_buffs */
        for (i = 0; i < rx_ring->count; i++) {
                struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
@@ -1408,6 +1412,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
                rx_bi->page_offset = 0;
        }
 
+skip_free:
        bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
        memset(rx_ring->rx_bi, 0, bi_size);
 
@@ -2641,7 +2646,9 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
        budget_per_ring = max(budget/q_vector->num_ringpairs, 1);
 
        i40e_for_each_ring(ring, q_vector->rx) {
-               int cleaned = i40e_clean_rx_irq(ring, budget_per_ring);
+               int cleaned = ring->xsk_umem ?
+                             i40e_clean_rx_irq_zc(ring, budget_per_ring) :
+                             i40e_clean_rx_irq(ring, budget_per_ring);
 
                work_done += cleaned;
                /* if we clean as many as budgeted, we must not be done */
index bb04f6a731fe995855a69f0b963c149be2c252b4..100e92d2982f2d7e7fc8683272948a974db2f467 100644 (file)
@@ -296,13 +296,17 @@ struct i40e_tx_buffer {
 
 struct i40e_rx_buffer {
        dma_addr_t dma;
-       struct page *page;
-#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
-       __u32 page_offset;
-#else
-       __u16 page_offset;
-#endif
-       __u16 pagecnt_bias;
+       union {
+               struct {
+                       struct page *page;
+                       __u32 page_offset;
+                       __u16 pagecnt_bias;
+               };
+               struct {
+                       void *addr;
+                       u64 handle;
+               };
+       };
 };
 
 struct i40e_queue_stats {
@@ -414,6 +418,8 @@ struct i40e_ring {
 
        struct i40e_channel *ch;
        struct xdp_rxq_info xdp_rxq;
+       struct xdp_umem *xsk_umem;
+       struct zero_copy_allocator zca; /* ZC allocator anchor */
 } ____cacheline_internodealigned_in_smp;
 
 static inline bool ring_uses_build_skb(struct i40e_ring *ring)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
new file mode 100644 (file)
index 0000000..bf502f2
--- /dev/null
@@ -0,0 +1,661 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Intel Corporation. */
+
+#include <linux/bpf_trace.h>
+#include <net/xdp_sock.h>
+#include <net/xdp.h>
+
+#include "i40e.h"
+#include "i40e_txrx_common.h"
+#include "i40e_xsk.h"
+
+/**
+ * i40e_alloc_xsk_umems - Allocate an array to store per ring UMEMs
+ * @vsi: Current VSI
+ *
+ * Returns 0 on success, <0 on failure
+ **/
+static int i40e_alloc_xsk_umems(struct i40e_vsi *vsi)
+{
+       if (vsi->xsk_umems)
+               return 0;
+
+       vsi->num_xsk_umems_used = 0;
+       vsi->num_xsk_umems = vsi->alloc_queue_pairs;
+       vsi->xsk_umems = kcalloc(vsi->num_xsk_umems, sizeof(*vsi->xsk_umems),
+                                GFP_KERNEL);
+       if (!vsi->xsk_umems) {
+               vsi->num_xsk_umems = 0;
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/**
+ * i40e_add_xsk_umem - Store a UMEM for a certain ring/qid
+ * @vsi: Current VSI
+ * @umem: UMEM to store
+ * @qid: Ring/qid to associate with the UMEM
+ *
+ * Returns 0 on success, <0 on failure
+ **/
+static int i40e_add_xsk_umem(struct i40e_vsi *vsi, struct xdp_umem *umem,
+                            u16 qid)
+{
+       int err;
+
+       err = i40e_alloc_xsk_umems(vsi);
+       if (err)
+               return err;
+
+       vsi->xsk_umems[qid] = umem;
+       vsi->num_xsk_umems_used++;
+
+       return 0;
+}
+
+/**
+ * i40e_remove_xsk_umem - Remove a UMEM for a certain ring/qid
+ * @vsi: Current VSI
+ * @qid: Ring/qid associated with the UMEM
+ **/
+static void i40e_remove_xsk_umem(struct i40e_vsi *vsi, u16 qid)
+{
+       vsi->xsk_umems[qid] = NULL;
+       vsi->num_xsk_umems_used--;
+
+       if (vsi->num_xsk_umems == 0) {
+               kfree(vsi->xsk_umems);
+               vsi->xsk_umems = NULL;
+               vsi->num_xsk_umems = 0;
+       }
+}
+
+/**
+ * i40e_xsk_umem_dma_map - DMA maps all UMEM memory for the netdev
+ * @vsi: Current VSI
+ * @umem: UMEM to DMA map
+ *
+ * Returns 0 on success, <0 on failure
+ **/
+static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
+{
+       struct i40e_pf *pf = vsi->back;
+       struct device *dev;
+       unsigned int i, j;
+       dma_addr_t dma;
+
+       dev = &pf->pdev->dev;
+       for (i = 0; i < umem->npgs; i++) {
+               dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
+                                        DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
+               if (dma_mapping_error(dev, dma))
+                       goto out_unmap;
+
+               umem->pages[i].dma = dma;
+       }
+
+       return 0;
+
+out_unmap:
+       for (j = 0; j < i; j++) {
+               dma_unmap_page_attrs(dev, umem->pages[j].dma, PAGE_SIZE,
+                                    DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
+               umem->pages[j].dma = 0;
+       }
+
+       return -1;
+}
+
+/**
+ * i40e_xsk_umem_dma_unmap - DMA unmaps all UMEM memory for the netdev
+ * @vsi: Current VSI
+ * @umem: UMEM to DMA unmap
+ **/
+static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem)
+{
+       struct i40e_pf *pf = vsi->back;
+       struct device *dev;
+       unsigned int i;
+
+       dev = &pf->pdev->dev;
+
+       for (i = 0; i < umem->npgs; i++) {
+               dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
+                                    DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
+
+               umem->pages[i].dma = 0;
+       }
+}
+
+/**
+ * i40e_xsk_umem_enable - Enable/associate a UMEM with a certain ring/qid
+ * @vsi: Current VSI
+ * @umem: UMEM
+ * @qid: Rx ring to associate UMEM to
+ *
+ * Returns 0 on success, <0 on failure
+ **/
+static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
+                               u16 qid)
+{
+       bool if_running;
+       int err;
+
+       if (vsi->type != I40E_VSI_MAIN)
+               return -EINVAL;
+
+       if (qid >= vsi->num_queue_pairs)
+               return -EINVAL;
+
+       if (vsi->xsk_umems) {
+               if (qid >= vsi->num_xsk_umems)
+                       return -EINVAL;
+               if (vsi->xsk_umems[qid])
+                       return -EBUSY;
+       }
+
+       err = i40e_xsk_umem_dma_map(vsi, umem);
+       if (err)
+               return err;
+
+       if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi);
+
+       if (if_running) {
+               err = i40e_queue_pair_disable(vsi, qid);
+               if (err)
+                       return err;
+       }
+
+       err = i40e_add_xsk_umem(vsi, umem, qid);
+       if (err)
+               return err;
+
+       if (if_running) {
+               err = i40e_queue_pair_enable(vsi, qid);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/**
+ * i40e_xsk_umem_disable - Disassociate a UMEM from a certain ring/qid
+ * @vsi: Current VSI
+ * @qid: Rx ring to disassociate the UMEM from
+ *
+ * Returns 0 on success, <0 on failure
+ **/
+static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid)
+{
+       bool if_running;
+       int err;
+
+       if (!vsi->xsk_umems || qid >= vsi->num_xsk_umems ||
+           !vsi->xsk_umems[qid])
+               return -EINVAL;
+
+       if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi);
+
+       if (if_running) {
+               err = i40e_queue_pair_disable(vsi, qid);
+               if (err)
+                       return err;
+       }
+
+       i40e_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]);
+       i40e_remove_xsk_umem(vsi, qid);
+
+       if (if_running) {
+               err = i40e_queue_pair_enable(vsi, qid);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/**
+ * i40e_xsk_umem_query - Queries a certain ring/qid for its UMEM
+ * @vsi: Current VSI
+ * @umem: Returns the UMEM associated with the ring, if any
+ * @qid: Rx ring to query
+ *
+ * This function stores the UMEM, if any, associated with the given ring/qid.
+ *
+ * Returns 0 on success, <0 on failure
+ **/
+int i40e_xsk_umem_query(struct i40e_vsi *vsi, struct xdp_umem **umem,
+                       u16 qid)
+{
+       if (vsi->type != I40E_VSI_MAIN)
+               return -EINVAL;
+
+       if (qid >= vsi->num_queue_pairs)
+               return -EINVAL;
+
+       if (vsi->xsk_umems) {
+               if (qid >= vsi->num_xsk_umems)
+                       return -EINVAL;
+               *umem = vsi->xsk_umems[qid];
+               return 0;
+       }
+
+       *umem = NULL;
+       return 0;
+}
+
+/**
+ * i40e_xsk_umem_setup - Enable/disable a UMEM for a certain ring/qid
+ * @vsi: Current VSI
+ * @umem: UMEM to enable/associate to a ring, or NULL to disable
+ * @qid: Rx ring to (dis)associate UMEM to/from
+ *
+ * This function enables or disables a UMEM for a certain ring/qid.
+ *
+ * Returns 0 on success, <0 on failure
+ **/
+int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
+                       u16 qid)
+{
+       return umem ? i40e_xsk_umem_enable(vsi, umem, qid) :
+               i40e_xsk_umem_disable(vsi, qid);
+}
+
+/**
+ * i40e_run_xdp_zc - Executes an XDP program on an xdp_buff
+ * @rx_ring: Rx ring
+ * @xdp: xdp_buff used as input to the XDP program
+ *
+ * This function runs the XDP program on the received frame.
+ *
+ * Returns any of I40E_XDP_{PASS, CONSUMED, TX, REDIR}
+ **/
+static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
+{
+       int err, result = I40E_XDP_PASS;
+       struct i40e_ring *xdp_ring;
+       struct bpf_prog *xdp_prog;
+       u32 act;
+
+       rcu_read_lock();
+       /* NB! xdp_prog will always be !NULL, since this path is only
+        * enabled when an XDP program is set.
+        */
+       xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+       act = bpf_prog_run_xdp(xdp_prog, xdp);
+       xdp->handle += xdp->data - xdp->data_hard_start;
+       switch (act) {
+       switch (act) {
+       case XDP_PASS:
+               break;
+       case XDP_TX:
+               xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
+               result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
+               break;
+       case XDP_REDIRECT:
+               err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
+               result = !err ? I40E_XDP_REDIR : I40E_XDP_CONSUMED;
+               break;
+       default:
+               bpf_warn_invalid_xdp_action(act);
+       case XDP_ABORTED:
+               trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
+               /* fallthrough -- handle aborts by dropping packet */
+       case XDP_DROP:
+               result = I40E_XDP_CONSUMED;
+               break;
+       }
+       rcu_read_unlock();
+       return result;
+}
+
+/**
+ * i40e_alloc_buffer_zc - Allocates an i40e_rx_buffer
+ * @rx_ring: Rx ring
+ * @bi: Rx buffer to populate
+ *
+ * This function allocates an Rx buffer. The buffer can come from the
+ * fill queue or from the recycle queue (next_to_alloc).
+ *
+ * Returns true for a successful allocation, false otherwise
+ **/
+static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
+                                struct i40e_rx_buffer *bi)
+{
+       struct xdp_umem *umem = rx_ring->xsk_umem;
+       void *addr = bi->addr;
+       u64 handle, hr;
+
+       if (addr) {
+               rx_ring->rx_stats.page_reuse_count++;
+               return true;
+       }
+
+       if (!xsk_umem_peek_addr(umem, &handle)) {
+               rx_ring->rx_stats.alloc_page_failed++;
+               return false;
+       }
+
+       hr = umem->headroom + XDP_PACKET_HEADROOM;
+
+       bi->dma = xdp_umem_get_dma(umem, handle);
+       bi->dma += hr;
+
+       bi->addr = xdp_umem_get_data(umem, handle);
+       bi->addr += hr;
+
+       bi->handle = handle + umem->headroom;
+
+       xsk_umem_discard_addr(umem);
+       return true;
+}
+
+/**
+ * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers
+ * @rx_ring: Rx ring
+ * @count: The number of buffers to allocate
+ *
+ * This function allocates a number of Rx buffers and places them on
+ * the Rx ring.
+ *
+ * Returns true for a successful allocation, false otherwise
+ **/
+bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
+{
+       u16 ntu = rx_ring->next_to_use;
+       union i40e_rx_desc *rx_desc;
+       struct i40e_rx_buffer *bi;
+       bool ok = true;
+
+       rx_desc = I40E_RX_DESC(rx_ring, ntu);
+       bi = &rx_ring->rx_bi[ntu];
+       do {
+               if (!i40e_alloc_buffer_zc(rx_ring, bi)) {
+                       ok = false;
+                       goto no_buffers;
+               }
+
+               dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0,
+                                                rx_ring->rx_buf_len,
+                                                DMA_BIDIRECTIONAL);
+
+               rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
+
+               rx_desc++;
+               bi++;
+               ntu++;
+
+               if (unlikely(ntu == rx_ring->count)) {
+                       rx_desc = I40E_RX_DESC(rx_ring, 0);
+                       bi = rx_ring->rx_bi;
+                       ntu = 0;
+               }
+
+               rx_desc->wb.qword1.status_error_len = 0;
+               count--;
+       } while (count);
+
+no_buffers:
+       if (rx_ring->next_to_use != ntu)
+               i40e_release_rx_desc(rx_ring, ntu);
+
+       return ok;
+}
+
+/**
+ * i40e_get_rx_buffer_zc - Return the current Rx buffer
+ * @rx_ring: Rx ring
+ * @size: The size of the rx buffer (read from descriptor)
+ *
+ * This function returns the current, received Rx buffer, and also
+ * does DMA synchronization on the buffer.
+ *
+ * Returns the received Rx buffer
+ **/
+static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring,
+                                                   const unsigned int size)
+{
+       struct i40e_rx_buffer *bi;
+
+       bi = &rx_ring->rx_bi[rx_ring->next_to_clean];
+
+       /* we are reusing so sync this buffer for CPU use */
+       dma_sync_single_range_for_cpu(rx_ring->dev,
+                                     bi->dma, 0,
+                                     size,
+                                     DMA_BIDIRECTIONAL);
+
+       return bi;
+}
+
+/**
+ * i40e_reuse_rx_buffer_zc - Recycle an Rx buffer
+ * @rx_ring: Rx ring
+ * @old_bi: The Rx buffer to recycle
+ *
+ * This function recycles a finished Rx buffer, and places it on the
+ * recycle queue (next_to_alloc).
+ **/
+static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
+                                   struct i40e_rx_buffer *old_bi)
+{
+       struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
+       unsigned long mask = (unsigned long)rx_ring->xsk_umem->props.chunk_mask;
+       u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
+       u16 nta = rx_ring->next_to_alloc;
+
+       /* update, and store next to alloc */
+       nta++;
+       rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+
+       /* transfer page from old buffer to new buffer */
+       new_bi->dma = old_bi->dma & mask;
+       new_bi->dma += hr;
+
+       new_bi->addr = (void *)((unsigned long)old_bi->addr & mask);
+       new_bi->addr += hr;
+
+       new_bi->handle = old_bi->handle & mask;
+       new_bi->handle += rx_ring->xsk_umem->headroom;
+
+       old_bi->addr = NULL;
+}
+
+/**
+ * i40e_zca_free - Free callback for MEM_TYPE_ZERO_COPY allocations
+ * @alloc: Zero-copy allocator
+ * @handle: Buffer handle
+ **/
+void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
+{
+       struct i40e_rx_buffer *bi;
+       struct i40e_ring *rx_ring;
+       u64 hr, mask;
+       u16 nta;
+
+       rx_ring = container_of(alloc, struct i40e_ring, zca);
+       hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
+       mask = rx_ring->xsk_umem->props.chunk_mask;
+
+       nta = rx_ring->next_to_alloc;
+       bi = &rx_ring->rx_bi[nta];
+
+       nta++;
+       rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+
+       handle &= mask;
+
+       bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
+       bi->dma += hr;
+
+       bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
+       bi->addr += hr;
+
+       bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
+}
+
+/**
+ * i40e_construct_skb_zc - Create an skb from a zero-copy Rx buffer
+ * @rx_ring: Rx ring
+ * @bi: Rx buffer
+ * @xdp: xdp_buff
+ *
+ * This function allocates a new skb and copies the zero-copy frame into it.
+ *
+ * Returns the skb, or NULL on failure.
+ **/
+static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
+                                            struct i40e_rx_buffer *bi,
+                                            struct xdp_buff *xdp)
+{
+       unsigned int metasize = xdp->data - xdp->data_meta;
+       unsigned int datasize = xdp->data_end - xdp->data;
+       struct sk_buff *skb;
+
+       /* allocate a skb to store the frags */
+       skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
+                              xdp->data_end - xdp->data_hard_start,
+                              GFP_ATOMIC | __GFP_NOWARN);
+       if (unlikely(!skb))
+               return NULL;
+
+       skb_reserve(skb, xdp->data - xdp->data_hard_start);
+       memcpy(__skb_put(skb, datasize), xdp->data, datasize);
+       if (metasize)
+               skb_metadata_set(skb, metasize);
+
+       i40e_reuse_rx_buffer_zc(rx_ring, bi);
+       return skb;
+}
+
+/**
+ * i40e_inc_ntc - Advance the next_to_clean index
+ * @rx_ring: Rx ring
+ **/
+static void i40e_inc_ntc(struct i40e_ring *rx_ring)
+{
+       u32 ntc = rx_ring->next_to_clean + 1;
+
+       ntc = (ntc < rx_ring->count) ? ntc : 0;
+       rx_ring->next_to_clean = ntc;
+       prefetch(I40E_RX_DESC(rx_ring, ntc));
+}
+
+/**
+ * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring
+ * @rx_ring: Rx ring
+ * @budget: NAPI budget
+ *
+ * Returns amount of work completed
+ **/
+int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
+{
+       unsigned int total_rx_bytes = 0, total_rx_packets = 0;
+       u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
+       unsigned int xdp_res, xdp_xmit = 0;
+       bool failure = false;
+       struct sk_buff *skb;
+       struct xdp_buff xdp;
+
+       xdp.rxq = &rx_ring->xdp_rxq;
+
+       while (likely(total_rx_packets < (unsigned int)budget)) {
+               struct i40e_rx_buffer *bi;
+               union i40e_rx_desc *rx_desc;
+               unsigned int size;
+               u16 vlan_tag;
+               u8 rx_ptype;
+               u64 qword;
+
+               if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
+                       failure = failure ||
+                                 !i40e_alloc_rx_buffers_zc(rx_ring,
+                                                           cleaned_count);
+                       cleaned_count = 0;
+               }
+
+               rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
+               qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
+
+               /* This memory barrier is needed to keep us from reading
+                * any other fields out of the rx_desc until we have
+                * verified the descriptor has been written back.
+                */
+               dma_rmb();
+
+               bi = i40e_clean_programming_status(rx_ring, rx_desc,
+                                                  qword);
+               if (unlikely(bi)) {
+                       i40e_reuse_rx_buffer_zc(rx_ring, bi);
+                       cleaned_count++;
+                       continue;
+               }
+
+               size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
+                      I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
+               if (!size)
+                       break;
+
+               bi = i40e_get_rx_buffer_zc(rx_ring, size);
+               xdp.data = bi->addr;
+               xdp.data_meta = xdp.data;
+               xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
+               xdp.data_end = xdp.data + size;
+               xdp.handle = bi->handle;
+
+               xdp_res = i40e_run_xdp_zc(rx_ring, &xdp);
+               if (xdp_res) {
+                       if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
+                               xdp_xmit |= xdp_res;
+                               bi->addr = NULL;
+                       } else {
+                               i40e_reuse_rx_buffer_zc(rx_ring, bi);
+                       }
+
+                       total_rx_bytes += size;
+                       total_rx_packets++;
+
+                       cleaned_count++;
+                       i40e_inc_ntc(rx_ring);
+                       continue;
+               }
+
+               /* XDP_PASS path */
+
+               /* NB! We are not checking for errors using
+                * i40e_test_staterr with
+                * BIT(I40E_RXD_QW1_ERROR_SHIFT). This is because SBP is
+                * *not* set in PRT_SBPVSI (it is not set by default).
+                */
+               skb = i40e_construct_skb_zc(rx_ring, bi, &xdp);
+               if (!skb) {
+                       rx_ring->rx_stats.alloc_buff_failed++;
+                       break;
+               }
+
+               cleaned_count++;
+               i40e_inc_ntc(rx_ring);
+
+               if (eth_skb_pad(skb))
+                       continue;
+
+               total_rx_bytes += skb->len;
+               total_rx_packets++;
+
+               qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
+               rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
+                          I40E_RXD_QW1_PTYPE_SHIFT;
+               i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
+
+               vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
+                          le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;
+               i40e_receive_skb(rx_ring, skb, vlan_tag);
+       }
+
+       i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
+       i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
+       return failure ? budget : (int)total_rx_packets;
+}
+
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
new file mode 100644 (file)
index 0000000..427a844
--- /dev/null
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2018 Intel Corporation. */
+
+#ifndef _I40E_XSK_H_
+#define _I40E_XSK_H_
+
+struct i40e_vsi;
+struct xdp_umem;
+struct zero_copy_allocator;
+
+int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair);
+int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair);
+int i40e_xsk_umem_query(struct i40e_vsi *vsi, struct xdp_umem **umem,
+                       u16 qid);
+int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
+                       u16 qid);
+void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
+bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
+int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
+
+#endif /* _I40E_XSK_H_ */