Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit
0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
* when the socket is uncorked.
*/
__u16 len; /* total length of pending frames */
+ __u16 gso_size;
/*
* Fields specific to UDP-Lite.
*/
int forward_deficit;
};
+#define UDP_MAX_SEGMENTS (1 << 6UL)
+
static inline struct udp_sock *udp_sk(const struct sock *sk)
{
return (struct udp_sock *)sk;
__u8 ttl;
__s16 tos;
char priority;
+ __u16 gso_size;
};
struct inet_cork_full {
__u8 ttl;
__s16 tos;
char priority;
+ __u16 gso_size;
};
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
__s16 tclass;
__s8 dontfrag;
struct ipv6_txoptions *opt;
+ __u16 gso_size;
};
static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
+#define UDP_SEGMENT 103 /* Set GSO segmentation size */
/* UDP encapsulation types */
#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
- !(flags & MSG_MORE) &&
+ (!(flags & MSG_MORE) || cork->gso_size) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
*rtp = NULL;
cork->fragsize = ip_sk_use_pmtu(sk) ?
dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+
+ cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
cork->dst = &rt->dst;
cork->length = 0;
cork->ttl = ipc->ttl;
return -EOPNOTSUPP;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
}
EXPORT_SYMBOL(udp_set_csum);
-static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+ struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
uh->len = htons(len);
uh->check = 0;
+ if (cork->gso_size) {
+ const int hlen = skb_network_header_len(skb) +
+ sizeof(struct udphdr);
+
+ if (hlen + cork->gso_size > cork->fragsize)
+ return -EINVAL;
+ if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+ return -EINVAL;
+ if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+ return -EIO;
+
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ }
+
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
if (!skb)
goto out;
- err = udp_send_skb(skb, fl4);
+ err = udp_send_skb(skb, fl4, &inet->cork.base);
out:
up->len = 0;
ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
+ ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
&cork, msg->msg_flags);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_send_skb(skb, fl4);
+ err = udp_send_skb(skb, fl4, &cork);
goto out;
}
up->no_check6_rx = valbool;
break;
+ case UDP_SEGMENT:
+ if (val < 0 || val > USHRT_MAX)
+ return -EINVAL;
+ up->gso_size = val;
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
val = up->no_check6_rx;
break;
+ case UDP_SEGMENT:
+ val = up->gso_size;
+ break;
+
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
if (mtu < IPV6_MIN_MTU)
return -EINVAL;
cork->base.fragsize = mtu;
+ cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
}
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
headersize == sizeof(struct ipv6hdr) &&
length <= mtu - headersize &&
- !(flags & MSG_MORE) &&
+ (!(flags & MSG_MORE) || cork->gso_size) &&
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
* Sending
*/
-static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
+static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+ struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct udphdr *uh;
uh->len = htons(len);
uh->check = 0;
+ if (cork->gso_size) {
+ const int hlen = skb_network_header_len(skb) +
+ sizeof(struct udphdr);
+
+ if (hlen + cork->gso_size > cork->fragsize)
+ return -EINVAL;
+ if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+ return -EINVAL;
+ if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+ return -EIO;
+
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ }
+
if (is_udplite)
csum = udplite_csum(skb);
else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
if (!skb)
goto out;
- err = udp_v6_send_skb(skb, &fl6);
+ err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
out:
up->len = 0;
ipc6.hlimit = -1;
ipc6.tclass = -1;
ipc6.dontfrag = -1;
+ ipc6.gso_size = up->gso_size;
sockc.tsflags = sk->sk_tsflags;
/* destination address check */
msg->msg_flags, &cork, &sockc);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_v6_send_skb(skb, &fl6);
+ err = udp_v6_send_skb(skb, &fl6, &cork.base);
goto out;
}