[TCP]: Fix lost_retrans loop vs fastpath problems
authorIlpo Järvinen <ilpo.jarvinen@helsinki.fi>
Fri, 12 Oct 2007 00:35:41 +0000 (17:35 -0700)
committerDavid S. Miller <davem@davemloft.net>
Fri, 12 Oct 2007 00:35:41 +0000 (17:35 -0700)
Detection implemented with lost_retrans must work also when
fastpath is taken, yet most of the queue is skipped including
(very likely) those retransmitted skb's we're interested in.
This problem appeared when the hints got added, which removed
a need to always walk over the whole write queue head.
Therefore decicion for the lost_retrans worker loop entry must
be separated from the sacktag processing more than it was
necessary before.

It turns out to be problematic to optimize the worker loop
very heavily because ack_seqs of skb may have a number of
discontinuity points. Maybe similar approach as currently is
implemented could be attempted but that's becoming more and
more complex because the trend is towards less skb walking
in sacktag marker. Trying a simple work until all rexmitted
skbs heve been processed approach.

Maybe after(highest_sack_end_seq, tp->high_seq) checking is not
sufficiently accurate and causes entry too often in no-work-to-do
cases. Since that's not known, I've separated solution to that
from this patch.

Noticed because of report against a related problem from TAKANO
Ryousei <takano@axe-inc.co.jp>. He also provided a patch to
that part of the problem. This patch includes solution to it
(though this patch has to use somewhat different placement).
TAKANO's description and patch is available here:

  http://marc.info/?l=linux-netdev&m=119149311913288&w=2

...In short, TAKANO's problem is that end_seq the loop is using
not necessarily the largest SACK block's end_seq because the
current ACK may still have higher SACK blocks which are later
by the loop.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/ipv4/tcp_input.c

index a0660391213751360d9a941d5f0a5856dc2f1900..d5e0fcc22a3be1ff1435bf84accfc81db4b0f6d1 100644 (file)
@@ -1109,27 +1109,34 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
 /* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
  * Event "C". Later note: FACK people cheated me again 8), we have to account
  * for reordering! Ugly, but should help.
+ *
+ * Search retransmitted skbs from write_queue that were sent when snd_nxt was
+ * less than what is now known to be received by the other end (derived from
+ * SACK blocks by the caller).
  */
-static int tcp_mark_lost_retrans(struct sock *sk, u32 lost_retrans)
+static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int flag = 0;
+       int cnt = 0;
 
        tcp_for_write_queue(skb, sk) {
                u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
 
                if (skb == tcp_send_head(sk))
                        break;
-               if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
+               if (cnt == tp->retrans_out)
                        break;
                if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                        continue;
 
-               if ((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
-                   after(lost_retrans, ack_seq) &&
+               if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
+                       continue;
+
+               if (after(received_upto, ack_seq) &&
                    (tcp_is_fack(tp) ||
-                    !before(lost_retrans,
+                    !before(received_upto,
                             ack_seq + tp->reordering * tp->mss_cache))) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                        tp->retrans_out -= tcp_skb_pcount(skb);
@@ -1143,6 +1150,8 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 lost_retrans)
                                flag |= FLAG_DATA_SACKED;
                                NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
                        }
+               } else {
+                       cnt += tcp_skb_pcount(skb);
                }
        }
        return flag;
@@ -1225,7 +1234,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
        int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
        int reord = tp->packets_out;
        int prior_fackets;
-       u32 lost_retrans = 0;
+       u32 highest_sack_end_seq = 0;
        int flag = 0;
        int found_dup_sack = 0;
        int cached_fack_count;
@@ -1396,11 +1405,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                                continue;
                        }
 
-                       if ((sacked&TCPCB_SACKED_RETRANS) &&
-                           after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
-                           (!lost_retrans || after(end_seq, lost_retrans)))
-                               lost_retrans = end_seq;
-
                        if (!in_sack)
                                continue;
 
@@ -1454,9 +1458,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                                if (fack_count > tp->fackets_out)
                                        tp->fackets_out = fack_count;
 
-                               if (after(TCP_SKB_CB(skb)->seq,
-                                   tp->highest_sack))
+                               if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) {
                                        tp->highest_sack = TCP_SKB_CB(skb)->seq;
+                                       highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq;
+                               }
                        } else {
                                if (dup_sack && (sacked&TCPCB_RETRANS))
                                        reord = min(fack_count, reord);
@@ -1476,8 +1481,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                }
        }
 
-       if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery)
-               flag |= tcp_mark_lost_retrans(sk, lost_retrans);
+       if (tp->retrans_out && highest_sack_end_seq &&
+           after(highest_sack_end_seq, tp->high_seq) &&
+           icsk->icsk_ca_state == TCP_CA_Recovery)
+               flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq);
 
        tcp_verify_left_out(tp);