ipv6: Implement automatic flow label generation on transmit
authorTom Herbert <therbert@google.com>
Wed, 2 Jul 2014 04:33:10 +0000 (21:33 -0700)
committerDavid S. Miller <davem@davemloft.net>
Tue, 8 Jul 2014 04:14:21 +0000 (21:14 -0700)
Automatically generate flow labels for IPv6 packets on transmit.
The flow label is computed based on skb_get_hash. The flow label will
only automatically be set when it is zero otherwise (i.e. flow label
manager hasn't set one). This supports the transmit side functionality
of RFC 6438.

Added an IPv6 sysctl auto_flowlabels to enable/disable this behavior
system wide, and added IPV6_AUTOFLOWLABEL socket option to enable this
functionality per socket.

By default, auto flowlabels are disabled to avoid possible conflicts
with flow label manager, however if this feature proves useful we
may want to enable it by default.

It should also be noted that FreeBSD has already implemented automatic
flow labels (including the sysctl and socket option). In FreeBSD,
automatic flow labels default to enabled.

Performance impact:

Running super_netperf with 200 flows for TCP_RR and UDP_RR for
IPv6. Note that in UDP case, __skb_get_hash will be called for
every packet with explains slight regression. In the TCP case
the hash is saved in the socket so there is no regression.

Automatic flow labels disabled:

  TCP_RR:
    86.53% CPU utilization
    127/195/322 90/95/99% latencies
    1.40498e+06 tps

  UDP_RR:
    90.70% CPU utilization
    118/168/243 90/95/99% latencies
    1.50309e+06 tps

Automatic flow labels enabled:

  TCP_RR:
    85.90% CPU utilization
    128/199/337 90/95/99% latencies
    1.40051e+06

  UDP_RR
    92.61% CPU utilization
    115/164/236 90/95/99% latencies
    1.4687e+06

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/networking/ip-sysctl.txt
include/linux/ipv6.h
include/net/ipv6.h
include/net/netns/ipv6.h
include/uapi/linux/in6.h
net/ipv6/af_inet6.c
net/ipv6/ip6_gre.c
net/ipv6/ip6_output.c
net/ipv6/ip6_tunnel.c
net/ipv6/ipv6_sockglue.c
net/ipv6/sysctl_net_ipv6.c

index 10e216c6e05e4bc1b385f957a8c75140c24687ae..f35bfe43bf7abac26690328c5332d154023fd818 100644 (file)
@@ -1132,6 +1132,15 @@ flowlabel_consistency - BOOLEAN
        FALSE: disabled
        Default: TRUE
 
+auto_flowlabels - BOOLEAN
+       Automatically generate flow labels based based on a flow hash
+       of the packet. This allows intermediate devices, such as routers,
+       to idenfify packet flows for mechanisms like Equal Cost Multipath
+       Routing (see RFC 6438).
+       TRUE: enabled
+       FALSE: disabled
+       Default: false
+
 anycast_src_echo_reply - BOOLEAN
        Controls the use of anycast addresses as source addresses for ICMPv6
        echo reply
index 5dc68c3ebcbdaebca5961314ff1557e216e91256..ff560537dd61b334a3dfbfb559dfa0f1cff20f41 100644 (file)
@@ -199,7 +199,8 @@ struct ipv6_pinfo {
                                                 * 010: prefer public address
                                                 * 100: prefer care-of address
                                                 */
-                               dontfrag:1;
+                               dontfrag:1,
+                               autoflowlabel:1;
        __u8                    min_hopcount;
        __u8                    tclass;
        __be32                  rcv_flowinfo;
index 2aa86e1135a1cc7e20370c680545ef2205263831..4308f2ada8b34ec8a5b83bbd7a73b6ade15092cb 100644 (file)
@@ -699,6 +699,26 @@ static inline void ip6_set_txhash(struct sock *sk)
        sk->sk_txhash = flow_hash_from_keys(&keys);
 }
 
+static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
+                                       __be32 flowlabel, bool autolabel)
+{
+       if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) {
+               __be32 hash;
+
+               hash = skb_get_hash(skb);
+
+               /* Since this is being sent on the wire obfuscate hash a bit
+                * to minimize possbility that any useful information to an
+                * attacker is leaked. Only lower 20 bits are relevant.
+                */
+               hash ^= hash >> 12;
+
+               flowlabel = hash & IPV6_FLOWLABEL_MASK;
+       }
+
+       return flowlabel;
+}
+
 /*
  *     Header manipulation
  */
index 19d3446e59d2555639e9553b1958d7354792af1e..eade27adecf3678ed2d1568adf34badd38ed88f9 100644 (file)
@@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 {
        int ip6_rt_mtu_expires;
        int ip6_rt_min_advmss;
        int flowlabel_consistency;
+       int auto_flowlabels;
        int icmpv6_time;
        int anycast_src_echo_reply;
        int fwmark_reflect;
index 0d8e0f0342dc183acd5393e11bfe203f019ca165..22b7a69619d87446904480342ede77cdb295164c 100644 (file)
@@ -233,6 +233,7 @@ struct in6_flowlabel_req {
 #if 0  /* not yet */
 #define IPV6_USE_MIN_MTU       63
 #endif
+#define IPV6_AUTOFLOWLABEL     64
 
 /*
  * Netfilter (1)
index a426cd7099bbba73831839f4f22395960114e99f..2daa3a133e498cdccfe5695ee62db7c72da8ce25 100644 (file)
@@ -765,6 +765,7 @@ static int __net_init inet6_net_init(struct net *net)
        net->ipv6.sysctl.bindv6only = 0;
        net->ipv6.sysctl.icmpv6_time = 1*HZ;
        net->ipv6.sysctl.flowlabel_consistency = 1;
+       net->ipv6.sysctl.auto_flowlabels = 0;
        atomic_set(&net->ipv6.rt_genid, 0);
 
        err = ipv6_init_mibs(net);
index 3873181ed85614a28f9857d7d53acf2be9d2b9fb..365b2b6f3942cdcdb8794bed6dcfa2e4477466b9 100644 (file)
@@ -723,7 +723,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
         *      Push down and install the IP header.
         */
        ipv6h = ipv6_hdr(skb);
-       ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel);
+       ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
+                    ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
        ipv6h->hop_limit = tunnel->parms.hop_limit;
        ipv6h->nexthdr = proto;
        ipv6h->saddr = fl6->saddr;
@@ -1174,7 +1175,9 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
        struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen);
        __be16 *p = (__be16 *)(ipv6h+1);
 
-       ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel);
+       ip6_flow_hdr(ipv6h, 0,
+                    ip6_make_flowlabel(dev_net(dev), skb,
+                                       t->fl.u.ip6.flowlabel, false));
        ipv6h->hop_limit = t->parms.hop_limit;
        ipv6h->nexthdr = NEXTHDR_GRE;
        ipv6h->saddr = t->parms.laddr;
index cb9df0eb40237065e696dd1013180b6c82a9c2dc..fa83bdd4c3dddd18a40eeea020bdad3cfd534162 100644 (file)
@@ -205,7 +205,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);
 
-       ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
+       ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
+                                                    np->autoflowlabel));
 
        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
@@ -1569,7 +1570,9 @@ int ip6_push_pending_frames(struct sock *sk)
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);
 
-       ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
+       ip6_flow_hdr(hdr, np->cork.tclass,
+                    ip6_make_flowlabel(net, skb, fl6->flowlabel,
+                                       np->autoflowlabel));
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
index afa082458360216ff33012ee43e93354888e44b9..51a1eb185ea7b6e255ecdb454eb12ed83875739d 100644 (file)
@@ -1046,7 +1046,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        ipv6h = ipv6_hdr(skb);
-       ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel);
+       ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
+                    ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
        ipv6h->hop_limit = t->parms.hop_limit;
        ipv6h->nexthdr = proto;
        ipv6h->saddr = fl6->saddr;
index cc34f65179e4b771438439d976c03467dc493c24..b50b9e54cf531cf7ec4b327b650856b72ff677c5 100644 (file)
@@ -834,6 +834,10 @@ pref_skip_coa:
                np->dontfrag = valbool;
                retv = 0;
                break;
+       case IPV6_AUTOFLOWLABEL:
+               np->autoflowlabel = valbool;
+               retv = 0;
+               break;
        }
 
        release_sock(sk);
@@ -1273,6 +1277,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
                val = np->dontfrag;
                break;
 
+       case IPV6_AUTOFLOWLABEL:
+               val = np->autoflowlabel;
+               break;
+
        default:
                return -ENOPROTOOPT;
        }
index 058f3eca2e53efd1fe016cfe8450d3ab0a9c13b1..5bf7b61f8ae8bc9403b9ea8b705f73d09eaccac5 100644 (file)
@@ -38,6 +38,13 @@ static struct ctl_table ipv6_table_template[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
+       {
+               .procname       = "auto_flowlabels",
+               .data           = &init_net.ipv6.sysctl.auto_flowlabels,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec
+       },
        {
                .procname       = "fwmark_reflect",
                .data           = &init_net.ipv6.sysctl.fwmark_reflect,
@@ -74,6 +81,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
        ipv6_table[0].data = &net->ipv6.sysctl.bindv6only;
        ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply;
        ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency;
+       ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels;
 
        ipv6_route_table = ipv6_route_sysctl_init(net);
        if (!ipv6_route_table)