ipv4: fix path MTU discovery with connection tracking
author Patrick McHardy <kaber@trash.net>
Sun, 26 Aug 2012 17:13:55 +0000 (19:13 +0200)
committer Patrick McHardy <kaber@trash.net>
Sun, 26 Aug 2012 17:13:55 +0000 (19:13 +0200)
IPv4 conntrack defragments incoming packets at the PRE_ROUTING hook and
(in the case of forwarded packets) refragments them at POST_ROUTING,
regardless of the IP_DF flag. Refragmentation uses the dst_mtu() of
the local route without caring about the original fragment sizes,
thereby breaking PMTUD.

This patch fixes this by keeping track of the largest received fragment
with IP_DF set and generating an ICMP fragmentation required error during
refragmentation if that size exceeds the MTU.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: David S. Miller <davem@davemloft.net>
include/net/inet_frag.h
include/net/ip.h
net/ipv4/ip_fragment.c
net/ipv4/ip_output.c

index 2431cf83aecafb74ca1e9464bec6f6a669046d75..5098ee7b7e0e752c836316c9e8ebd59d8597117e 100644 (file)
@@ -29,6 +29,8 @@ struct inet_frag_queue {
 #define INET_FRAG_COMPLETE     4
 #define INET_FRAG_FIRST_IN     2
 #define INET_FRAG_LAST_IN      1
+
+       u16                     max_size;
 };
 
 #define INETFRAGS_HASHSZ               64
index 5a5d84d3d2c6b6e3777035a631fb10e7479ab8de..0707fb9551aa4c1011c88969a42cd4482450d035 100644 (file)
@@ -42,6 +42,8 @@ struct inet_skb_parm {
 #define IPSKB_XFRM_TRANSFORMED 4
 #define IPSKB_FRAG_COMPLETE    8
 #define IPSKB_REROUTED         16
+
+       u16                     frag_max_size;
 };
 
 static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
index 8d07c973409ca3df9d09f8fb6b3a614d304603a8..fa6a12c51066135c6590ca8746b1a71596ea7ac6 100644 (file)
@@ -523,6 +523,10 @@ found:
        if (offset == 0)
                qp->q.last_in |= INET_FRAG_FIRST_IN;
 
+       if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+           skb->len + ihl > qp->q.max_size)
+               qp->q.max_size = skb->len + ihl;
+
        if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
            qp->q.meat == qp->q.len)
                return ip_frag_reasm(qp, prev, dev);
@@ -646,9 +650,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
        head->next = NULL;
        head->dev = dev;
        head->tstamp = qp->q.stamp;
+       IPCB(head)->frag_max_size = qp->q.max_size;
 
        iph = ip_hdr(head);
-       iph->frag_off = 0;
+       /* max_size != 0 implies at least one fragment had IP_DF set */
+       iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
        iph->tot_len = htons(len);
        iph->tos |= ecn;
        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
index c196d749daf23b3823ffe012495ea5d9411be99a..a5beab1dc95850065a967b4ec3d2a7436326c255 100644 (file)
@@ -467,7 +467,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
        iph = ip_hdr(skb);
 
-       if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+       if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
+                    (IPCB(skb)->frag_max_size &&
+                     IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));