igbvf: Make next_to_watch a pointer and adjust memory barriers to avoid races
author Alexander Duyck <alexander.h.duyck@intel.com>
Thu, 31 Jan 2013 07:15:51 +0000 (07:15 +0000)
committer Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Sat, 16 Feb 2013 05:46:51 +0000 (21:46 -0800)
This change is meant to address several race issues that become possible
because next_to_watch could be set to a value that indicates the descriptor
is done when it is not.  To correct that, we instead make next_to_watch a
pointer that is set to NULL during cleanup, and set to the eop_desc after
the descriptor rings have been written.

To enforce proper ordering, the next_to_watch pointer is not set until after
the wmb() that follows writing the values to the last descriptor in a
transmit.  To guarantee that the descriptor is not read until after the
eop_desc pointer has been read, we use read_barrier_depends(), which is only
really necessary on the Alpha architecture.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Greg Rose <gregory.v.rose@intel.com>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
drivers/net/ethernet/intel/igbvf/igbvf.h
drivers/net/ethernet/intel/igbvf/netdev.c

index fdca7b6727764fb574ffc88a85f47ee81fdc5f23..a1463e3d14c0522511c75c424ffa16bd3162411a 100644 (file)
@@ -127,8 +127,8 @@ struct igbvf_buffer {
                /* Tx */
                struct {
                        unsigned long time_stamp;
+                       union e1000_adv_tx_desc *next_to_watch;
                        u16 length;
-                       u16 next_to_watch;
                        u16 mapped_as_page;
                };
                /* Rx */
index f53f7136e508955ab7a16f4131a2818111de03c8..d60cd43934153df96f45a164c05043e1b36a0dd9 100644 (file)
@@ -797,20 +797,31 @@ static bool igbvf_clean_tx_irq(struct igbvf_ring *tx_ring)
        struct sk_buff *skb;
        union e1000_adv_tx_desc *tx_desc, *eop_desc;
        unsigned int total_bytes = 0, total_packets = 0;
-       unsigned int i, eop, count = 0;
+       unsigned int i, count = 0;
        bool cleaned = false;
 
        i = tx_ring->next_to_clean;
-       eop = tx_ring->buffer_info[i].next_to_watch;
-       eop_desc = IGBVF_TX_DESC_ADV(*tx_ring, eop);
+       buffer_info = &tx_ring->buffer_info[i];
+       eop_desc = buffer_info->next_to_watch;
+
+       do {
+               /* if next_to_watch is not set then there is no work pending */
+               if (!eop_desc)
+                       break;
+
+               /* prevent any other reads prior to eop_desc */
+               read_barrier_depends();
+
+               /* if DD is not set pending work has not been completed */
+               if (!(eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)))
+                       break;
+
+               /* clear next_to_watch to prevent false hangs */
+               buffer_info->next_to_watch = NULL;
 
-       while ((eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)) &&
-              (count < tx_ring->count)) {
-               rmb();  /* read buffer_info after eop_desc status */
                for (cleaned = false; !cleaned; count++) {
                        tx_desc = IGBVF_TX_DESC_ADV(*tx_ring, i);
-                       buffer_info = &tx_ring->buffer_info[i];
-                       cleaned = (i == eop);
+                       cleaned = (tx_desc == eop_desc);
                        skb = buffer_info->skb;
 
                        if (skb) {
@@ -831,10 +842,12 @@ static bool igbvf_clean_tx_irq(struct igbvf_ring *tx_ring)
                        i++;
                        if (i == tx_ring->count)
                                i = 0;
+
+                       buffer_info = &tx_ring->buffer_info[i];
                }
-               eop = tx_ring->buffer_info[i].next_to_watch;
-               eop_desc = IGBVF_TX_DESC_ADV(*tx_ring, eop);
-       }
+
+               eop_desc = buffer_info->next_to_watch;
+       } while (count < tx_ring->count);
 
        tx_ring->next_to_clean = i;
 
@@ -1961,7 +1974,6 @@ static int igbvf_tso(struct igbvf_adapter *adapter,
        context_desc->seqnum_seed = 0;
 
        buffer_info->time_stamp = jiffies;
-       buffer_info->next_to_watch = i;
        buffer_info->dma = 0;
        i++;
        if (i == tx_ring->count)
@@ -2021,7 +2033,6 @@ static inline bool igbvf_tx_csum(struct igbvf_adapter *adapter,
                context_desc->mss_l4len_idx = 0;
 
                buffer_info->time_stamp = jiffies;
-               buffer_info->next_to_watch = i;
                buffer_info->dma = 0;
                i++;
                if (i == tx_ring->count)
@@ -2061,8 +2072,7 @@ static int igbvf_maybe_stop_tx(struct net_device *netdev, int size)
 
 static inline int igbvf_tx_map_adv(struct igbvf_adapter *adapter,
                                    struct igbvf_ring *tx_ring,
-                                   struct sk_buff *skb,
-                                   unsigned int first)
+                                  struct sk_buff *skb)
 {
        struct igbvf_buffer *buffer_info;
        struct pci_dev *pdev = adapter->pdev;
@@ -2077,7 +2087,6 @@ static inline int igbvf_tx_map_adv(struct igbvf_adapter *adapter,
        buffer_info->length = len;
        /* set time_stamp *before* dma to help avoid a possible race */
        buffer_info->time_stamp = jiffies;
-       buffer_info->next_to_watch = i;
        buffer_info->mapped_as_page = false;
        buffer_info->dma = dma_map_single(&pdev->dev, skb->data, len,
                                          DMA_TO_DEVICE);
@@ -2100,7 +2109,6 @@ static inline int igbvf_tx_map_adv(struct igbvf_adapter *adapter,
                BUG_ON(len >= IGBVF_MAX_DATA_PER_TXD);
                buffer_info->length = len;
                buffer_info->time_stamp = jiffies;
-               buffer_info->next_to_watch = i;
                buffer_info->mapped_as_page = true;
                buffer_info->dma = skb_frag_dma_map(&pdev->dev, frag, 0, len,
                                                DMA_TO_DEVICE);
@@ -2109,7 +2117,6 @@ static inline int igbvf_tx_map_adv(struct igbvf_adapter *adapter,
        }
 
        tx_ring->buffer_info[i].skb = skb;
-       tx_ring->buffer_info[first].next_to_watch = i;
 
        return ++count;
 
@@ -2120,7 +2127,6 @@ dma_error:
        buffer_info->dma = 0;
        buffer_info->time_stamp = 0;
        buffer_info->length = 0;
-       buffer_info->next_to_watch = 0;
        buffer_info->mapped_as_page = false;
        if (count)
                count--;
@@ -2139,7 +2145,8 @@ dma_error:
 
 static inline void igbvf_tx_queue_adv(struct igbvf_adapter *adapter,
                                       struct igbvf_ring *tx_ring,
-                                      int tx_flags, int count, u32 paylen,
+                                     int tx_flags, int count,
+                                     unsigned int first, u32 paylen,
                                       u8 hdr_len)
 {
        union e1000_adv_tx_desc *tx_desc = NULL;
@@ -2189,6 +2196,7 @@ static inline void igbvf_tx_queue_adv(struct igbvf_adapter *adapter,
         * such as IA-64). */
        wmb();
 
+       tx_ring->buffer_info[first].next_to_watch = tx_desc;
        tx_ring->next_to_use = i;
        writel(i, adapter->hw.hw_addr + tx_ring->tail);
        /* we need this if more than one processor can write to our tail
@@ -2255,11 +2263,11 @@ static netdev_tx_t igbvf_xmit_frame_ring_adv(struct sk_buff *skb,
         * count reflects descriptors mapped, if 0 then mapping error
         * has occurred and we need to rewind the descriptor queue
         */
-       count = igbvf_tx_map_adv(adapter, tx_ring, skb, first);
+       count = igbvf_tx_map_adv(adapter, tx_ring, skb);
 
        if (count) {
                igbvf_tx_queue_adv(adapter, tx_ring, tx_flags, count,
-                                  skb->len, hdr_len);
+                                  first, skb->len, hdr_len);
                /* Make sure there is space in the ring for the next send. */
                igbvf_maybe_stop_tx(netdev, MAX_SKB_FRAGS + 4);
        } else {