sfc: Stop TX queues before they fill up
author    Ben Hutchings <bhutchings@solarflare.com>
Tue, 22 May 2012 00:27:58 +0000 (01:27 +0100)
committer Ben Hutchings <bhutchings@solarflare.com>
Fri, 24 Aug 2012 18:00:27 +0000 (19:00 +0100)
We now have a definite upper bound on the number of descriptors per
skb; use that to stop the queue when the next packet might not fit.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
drivers/net/ethernet/sfc/efx.c
drivers/net/ethernet/sfc/net_driver.h
drivers/net/ethernet/sfc/tx.c
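
For orientation before the diff: the stop threshold added in efx_start_datapath() leaves room for one worst-case skb, and the wake threshold is half of that, so the queue drains half way back to empty before it is restarted. A minimal, self-contained sketch of that arithmetic follows; the ring size and per-skb descriptor bound are hypothetical example values standing in for efx->txq_entries and efx_tx_max_skb_descs(efx), not figures taken from the driver.

/* Sketch of the stop/wake threshold arithmetic; the numbers are
 * hypothetical, standing in for efx->txq_entries and
 * efx_tx_max_skb_descs(efx) in the real driver.
 */
#include <stdio.h>

int main(void)
{
        unsigned int txq_entries = 1024;        /* hypothetical TX ring size */
        unsigned int max_skb_descs = 130;       /* hypothetical worst-case descriptors per skb */

        /* Stop once one more worst-case skb might fill the ring completely
         * (one descriptor must always stay unused).
         */
        unsigned int stop_thresh = txq_entries - max_skb_descs;
        /* Wake when the fill level has fallen to half the stop threshold. */
        unsigned int wake_thresh = stop_thresh / 2;

        printf("stop at fill level >= %u, wake at fill level <= %u\n",
               stop_thresh, wake_thresh);
        return 0;
}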

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 65a8d49106a4c63c6a7faf04c8e3334421ce6472..3b3f08489a5eefa916ec9b69b385bdeb49a12dbb 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -630,6 +630,16 @@ static void efx_start_datapath(struct efx_nic *efx)
        efx->rx_buffer_order = get_order(efx->rx_buffer_len +
                                         sizeof(struct efx_rx_page_state));
 
+       /* We must keep at least one descriptor in a TX ring empty.
+        * We could avoid this when the queue size does not exactly
+        * match the hardware ring size, but it's not that important.
+        * Therefore we stop the queue when one more skb might fill
+        * the ring completely.  We wake it when half way back to
+        * empty.
+        */
+       efx->txq_stop_thresh = efx->txq_entries - efx_tx_max_skb_descs(efx);
+       efx->txq_wake_thresh = efx->txq_stop_thresh / 2;
+
        /* Initialise the channels */
        efx_for_each_channel(channel, efx) {
                efx_for_each_channel_tx_queue(tx_queue, channel)
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 0ac01fa6e63cc8adfb59aad922e7c7b03a696eac..28a6d6258692e05bae4a4796ce67579135c4eead 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -665,6 +665,8 @@ struct vfdi_status;
  *     should be allocated for this NIC
  * @rxq_entries: Size of receive queues requested by user.
  * @txq_entries: Size of transmit queues requested by user.
+ * @txq_stop_thresh: TX queue fill level at or above which we stop it.
+ * @txq_wake_thresh: TX queue fill level at or below which we wake it.
  * @tx_dc_base: Base qword address in SRAM of TX queue descriptor caches
  * @rx_dc_base: Base qword address in SRAM of RX queue descriptor caches
  * @sram_lim_qw: Qword address limit of SRAM
@@ -775,6 +777,9 @@ struct efx_nic {
 
        unsigned rxq_entries;
        unsigned txq_entries;
+       unsigned int txq_stop_thresh;
+       unsigned int txq_wake_thresh;
+
        unsigned tx_dc_base;
        unsigned rx_dc_base;
        unsigned sram_lim_qw;
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 24c82f3ce0f3c378815896db1c247d59b69a3072..330d9111a33911bd3124fca22dee768b429c5a77 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
 #include "nic.h"
 #include "workarounds.h"
 
-/*
- * TX descriptor ring full threshold
- *
- * The tx_queue descriptor ring fill-level must fall below this value
- * before we restart the netif queue
- */
-#define EFX_TXQ_THRESHOLD(_efx) ((_efx)->txq_entries / 2u)
-
 static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
                               struct efx_tx_buffer *buffer,
                               unsigned int *pkts_compl,
@@ -138,6 +130,56 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
        return max_descs;
 }
 
+/* Get partner of a TX queue, seen as part of the same net core queue */
+static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue)
+{
+       if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD)
+               return tx_queue - EFX_TXQ_TYPE_OFFLOAD;
+       else
+               return tx_queue + EFX_TXQ_TYPE_OFFLOAD;
+}
+
+static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1)
+{
+       /* We need to consider both queues that the net core sees as one */
+       struct efx_tx_queue *txq2 = efx_tx_queue_partner(txq1);
+       struct efx_nic *efx = txq1->efx;
+       unsigned int fill_level;
+
+       fill_level = max(txq1->insert_count - txq1->old_read_count,
+                        txq2->insert_count - txq2->old_read_count);
+       if (likely(fill_level < efx->txq_stop_thresh))
+               return;
+
+       /* We used the stale old_read_count above, which gives us a
+        * pessimistic estimate of the fill level (which may even
+        * validly be >= efx->txq_entries).  Now try again using
+        * read_count (more likely to be a cache miss).
+        *
+        * If we read read_count and then conditionally stop the
+        * queue, it is possible for the completion path to race with
+        * us and complete all outstanding descriptors in the middle,
+        * after which there will be no more completions to wake it.
+        * Therefore we stop the queue first, then read read_count
+        * (with a memory barrier to ensure the ordering), then
+        * restart the queue if the fill level turns out to be low
+        * enough.
+        */
+       netif_tx_stop_queue(txq1->core_txq);
+       smp_mb();
+       txq1->old_read_count = ACCESS_ONCE(txq1->read_count);
+       txq2->old_read_count = ACCESS_ONCE(txq2->read_count);
+
+       fill_level = max(txq1->insert_count - txq1->old_read_count,
+                        txq2->insert_count - txq2->old_read_count);
+       EFX_BUG_ON_PARANOID(fill_level >= efx->txq_entries);
+       if (likely(fill_level < efx->txq_stop_thresh)) {
+               smp_mb();
+               if (likely(!efx->loopback_selftest))
+                       netif_tx_start_queue(txq1->core_txq);
+       }
+}
+
 /*
  * Add a socket buffer to a TX queue
  *
@@ -151,7 +193,7 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
  * This function is split out from efx_hard_start_xmit to allow the
  * loopback test to direct packets via specific TX queues.
  *
- * Returns NETDEV_TX_OK or NETDEV_TX_BUSY
+ * Returns NETDEV_TX_OK.
  * You must hold netif_tx_lock() to call this function.
  */
 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
@@ -160,12 +202,11 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
        struct device *dma_dev = &efx->pci_dev->dev;
        struct efx_tx_buffer *buffer;
        skb_frag_t *fragment;
-       unsigned int len, unmap_len = 0, fill_level, insert_ptr;
+       unsigned int len, unmap_len = 0, insert_ptr;
        dma_addr_t dma_addr, unmap_addr = 0;
        unsigned int dma_len;
        unsigned short dma_flags;
-       int q_space, i = 0;
-       netdev_tx_t rc = NETDEV_TX_OK;
+       int i = 0;
 
        EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
 
@@ -183,9 +224,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
                        return NETDEV_TX_OK;
        }
 
-       fill_level = tx_queue->insert_count - tx_queue->old_read_count;
-       q_space = efx->txq_entries - 1 - fill_level;
-
        /* Map for DMA.  Use dma_map_single rather than dma_map_page
         * since this is more efficient on machines with sparse
         * memory.
@@ -205,32 +243,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 
                /* Add to TX queue, splitting across DMA boundaries */
                do {
-                       if (unlikely(q_space-- <= 0)) {
-                               /* It might be that completions have
-                                * happened since the xmit path last
-                                * checked.  Update the xmit path's
-                                * copy of read_count.
-                                */
-                               netif_tx_stop_queue(tx_queue->core_txq);
-                               /* This memory barrier protects the
-                                * change of queue state from the access
-                                * of read_count. */
-                               smp_mb();
-                               tx_queue->old_read_count =
-                                       ACCESS_ONCE(tx_queue->read_count);
-                               fill_level = (tx_queue->insert_count
-                                             - tx_queue->old_read_count);
-                               q_space = efx->txq_entries - 1 - fill_level;
-                               if (unlikely(q_space-- <= 0)) {
-                                       rc = NETDEV_TX_BUSY;
-                                       goto unwind;
-                               }
-                               smp_mb();
-                               if (likely(!efx->loopback_selftest))
-                                       netif_tx_start_queue(
-                                               tx_queue->core_txq);
-                       }
-
                        insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
                        buffer = &tx_queue->buffer[insert_ptr];
                        efx_tsoh_free(tx_queue, buffer);
@@ -277,6 +289,8 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
        /* Pass off to hardware */
        efx_nic_push_buffers(tx_queue);
 
+       efx_tx_maybe_stop_queue(tx_queue);
+
        return NETDEV_TX_OK;
 
  dma_err:
@@ -288,7 +302,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
        /* Mark the packet as transmitted, and free the SKB ourselves */
        dev_kfree_skb_any(skb);
 
- unwind:
        /* Work backwards until we hit the original insert pointer value */
        while (tx_queue->insert_count != tx_queue->write_count) {
                unsigned int pkts_compl = 0, bytes_compl = 0;
@@ -309,7 +322,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
                                       DMA_TO_DEVICE);
        }
 
-       return rc;
+       return NETDEV_TX_OK;
 }
 
 /* Remove packets from the TX queue
@@ -448,6 +461,7 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
 {
        unsigned fill_level;
        struct efx_nic *efx = tx_queue->efx;
+       struct efx_tx_queue *txq2;
        unsigned int pkts_compl = 0, bytes_compl = 0;
 
        EFX_BUG_ON_PARANOID(index > tx_queue->ptr_mask);
@@ -455,15 +469,18 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
        efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl);
        netdev_tx_completed_queue(tx_queue->core_txq, pkts_compl, bytes_compl);
 
-       /* See if we need to restart the netif queue.  This barrier
-        * separates the update of read_count from the test of the
-        * queue state. */
+       /* See if we need to restart the netif queue.  This memory
+        * barrier ensures that we write read_count (inside
+        * efx_dequeue_buffers()) before reading the queue status.
+        */
        smp_mb();
        if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) &&
            likely(efx->port_enabled) &&
            likely(netif_device_present(efx->net_dev))) {
-               fill_level = tx_queue->insert_count - tx_queue->read_count;
-               if (fill_level < EFX_TXQ_THRESHOLD(efx))
+               txq2 = efx_tx_queue_partner(tx_queue);
+               fill_level = max(tx_queue->insert_count - tx_queue->read_count,
+                                txq2->insert_count - txq2->read_count);
+               if (fill_level <= efx->txq_wake_thresh)
                        netif_tx_wake_queue(tx_queue->core_txq);
        }
 
@@ -776,47 +793,19 @@ efx_tsoh_heap_free(struct efx_tx_queue *tx_queue, struct efx_tso_header *tsoh)
  * @len:               Length of fragment
  * @final_buffer:      The final buffer inserted into the queue
  *
- * Push descriptors onto the TX queue.  Return 0 on success or 1 if
- * @tx_queue full.
+ * Push descriptors onto the TX queue.
  */
-static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
-                              dma_addr_t dma_addr, unsigned len,
-                              struct efx_tx_buffer **final_buffer)
+static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
+                               dma_addr_t dma_addr, unsigned len,
+                               struct efx_tx_buffer **final_buffer)
 {
        struct efx_tx_buffer *buffer;
        struct efx_nic *efx = tx_queue->efx;
-       unsigned dma_len, fill_level, insert_ptr;
-       int q_space;
+       unsigned dma_len, insert_ptr;
 
        EFX_BUG_ON_PARANOID(len <= 0);
 
-       fill_level = tx_queue->insert_count - tx_queue->old_read_count;
-       /* -1 as there is no way to represent all descriptors used */
-       q_space = efx->txq_entries - 1 - fill_level;
-
        while (1) {
-               if (unlikely(q_space-- <= 0)) {
-                       /* It might be that completions have happened
-                        * since the xmit path last checked.  Update
-                        * the xmit path's copy of read_count.
-                        */
-                       netif_tx_stop_queue(tx_queue->core_txq);
-                       /* This memory barrier protects the change of
-                        * queue state from the access of read_count. */
-                       smp_mb();
-                       tx_queue->old_read_count =
-                               ACCESS_ONCE(tx_queue->read_count);
-                       fill_level = (tx_queue->insert_count
-                                     - tx_queue->old_read_count);
-                       q_space = efx->txq_entries - 1 - fill_level;
-                       if (unlikely(q_space-- <= 0)) {
-                               *final_buffer = NULL;
-                               return 1;
-                       }
-                       smp_mb();
-                       netif_tx_start_queue(tx_queue->core_txq);
-               }
-
                insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
                buffer = &tx_queue->buffer[insert_ptr];
                ++tx_queue->insert_count;
@@ -847,7 +836,6 @@ static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
        EFX_BUG_ON_PARANOID(!len);
        buffer->len = len;
        *final_buffer = buffer;
-       return 0;
 }
 
 
@@ -975,20 +963,19 @@ static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx,
  * @st:                        TSO state
  *
  * Form descriptors for the current fragment, until we reach the end
- * of fragment or end-of-packet.  Return 0 on success, 1 if not enough
- * space in @tx_queue.
+ * of fragment or end-of-packet.
  */
-static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
-                                        const struct sk_buff *skb,
-                                        struct tso_state *st)
+static void tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
+                                         const struct sk_buff *skb,
+                                         struct tso_state *st)
 {
        struct efx_tx_buffer *buffer;
-       int n, rc;
+       int n;
 
        if (st->in_len == 0)
-               return 0;
+               return;
        if (st->packet_space == 0)
-               return 0;
+               return;
 
        EFX_BUG_ON_PARANOID(st->in_len <= 0);
        EFX_BUG_ON_PARANOID(st->packet_space <= 0);
@@ -999,26 +986,24 @@ static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
        st->out_len -= n;
        st->in_len -= n;
 
-       rc = efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer);
-       if (likely(rc == 0)) {
-               if (st->out_len == 0) {
-                       /* Transfer ownership of the skb */
-                       buffer->skb = skb;
-                       buffer->flags = EFX_TX_BUF_SKB;
-               } else if (st->packet_space != 0) {
-                       buffer->flags = EFX_TX_BUF_CONT;
-               }
+       efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer);
 
-               if (st->in_len == 0) {
-                       /* Transfer ownership of the DMA mapping */
-                       buffer->unmap_len = st->unmap_len;
-                       buffer->flags |= st->dma_flags;
-                       st->unmap_len = 0;
-               }
+       if (st->out_len == 0) {
+               /* Transfer ownership of the skb */
+               buffer->skb = skb;
+               buffer->flags = EFX_TX_BUF_SKB;
+       } else if (st->packet_space != 0) {
+               buffer->flags = EFX_TX_BUF_CONT;
+       }
+
+       if (st->in_len == 0) {
+               /* Transfer ownership of the DMA mapping */
+               buffer->unmap_len = st->unmap_len;
+               buffer->flags |= st->dma_flags;
+               st->unmap_len = 0;
        }
 
        st->dma_addr += n;
-       return rc;
 }
 
 
@@ -1112,13 +1097,13 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
  *
  * Add socket buffer @skb to @tx_queue, doing TSO or return != 0 if
  * @skb was not enqueued.  In all cases @skb is consumed.  Return
- * %NETDEV_TX_OK or %NETDEV_TX_BUSY.
+ * %NETDEV_TX_OK.
  */
 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
                               struct sk_buff *skb)
 {
        struct efx_nic *efx = tx_queue->efx;
-       int frag_i, rc, rc2 = NETDEV_TX_OK;
+       int frag_i, rc;
        struct tso_state state;
 
        /* Find the packet protocol and sanity-check it */
@@ -1150,11 +1135,7 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
                goto mem_err;
 
        while (1) {
-               rc = tso_fill_packet_with_fragment(tx_queue, skb, &state);
-               if (unlikely(rc)) {
-                       rc2 = NETDEV_TX_BUSY;
-                       goto unwind;
-               }
+               tso_fill_packet_with_fragment(tx_queue, skb, &state);
 
                /* Move onto the next fragment? */
                if (state.in_len == 0) {
@@ -1178,6 +1159,8 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
        /* Pass off to hardware */
        efx_nic_push_buffers(tx_queue);
 
+       efx_tx_maybe_stop_queue(tx_queue);
+
        tx_queue->tso_bursts++;
        return NETDEV_TX_OK;
 
@@ -1186,7 +1169,6 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
                  "Out of memory for TSO headers, or DMA mapping error\n");
        dev_kfree_skb_any(skb);
 
- unwind:
        /* Free the DMA mapping we were in the process of writing out */
        if (state.unmap_len) {
                if (state.dma_flags & EFX_TX_BUF_MAP_SINGLE)
@@ -1198,7 +1180,7 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
        }
 
        efx_enqueue_unwind(tx_queue);
-       return rc2;
+       return NETDEV_TX_OK;
 }