#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _UAPI_ASM_SOCKET_H */
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _UAPI__ASM_AVR32_SOCKET_H */
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _ASM_SOCKET_H */
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _ASM_IA64_SOCKET_H */
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _ASM_M32R_SOCKET_H */
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _UAPI_ASM_SOCKET_H */
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _ASM_SOCKET_H */
#define SO_ATTACH_BPF 0x402B
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 0x402C
+#define SO_ATTACH_REUSEPORT_EBPF 0x402D
+
#endif /* _UAPI_ASM_SOCKET_H */
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _ASM_POWERPC_SOCKET_H */
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _ASM_SOCKET_H */
#define SO_ATTACH_BPF 0x0034
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 0x0035
+#define SO_ATTACH_REUSEPORT_EBPF 0x0036
+
/* Security levels - as per NRL IPv6 - don't actually do anything */
#define SO_SECURITY_AUTHENTICATION 0x5001
#define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* _XTENSA_SOCKET_H */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
+int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
+int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
int sk_detach_filter(struct sock *sk);
int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
unsigned int len);
#ifndef _SOCK_REUSEPORT_H
#define _SOCK_REUSEPORT_H
+#include <linux/filter.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/sock.h>
u16 max_socks; /* length of socks */
u16 num_socks; /* elements in socks */
+ struct bpf_prog __rcu *prog; /* optional BPF sock selector */
struct sock *socks[0]; /* array of sock pointers */
};
extern int reuseport_alloc(struct sock *sk);
extern int reuseport_add_sock(struct sock *sk, const struct sock *sk2);
extern void reuseport_detach_sock(struct sock *sk);
-extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash);
+extern struct sock *reuseport_select_sock(struct sock *sk,
+ u32 hash,
+ struct sk_buff *skb,
+ int hdr_len);
+extern struct bpf_prog *reuseport_attach_prog(struct sock *sk,
+ struct bpf_prog *prog);
#endif /* _SOCK_REUSEPORT_H */
__be32 daddr, __be16 dport, int dif);
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif,
- struct udp_table *tbl);
+ struct udp_table *tbl, struct sk_buff *skb);
struct sock *udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
struct sock *__udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
- int dif, struct udp_table *tbl);
+ int dif, struct udp_table *tbl,
+ struct sk_buff *skb);
/*
* SNMP statistics for UDP and UDP-Lite
#define SO_ATTACH_BPF 50
#define SO_DETACH_BPF SO_DETACH_FILTER
+#define SO_ATTACH_REUSEPORT_CBPF 51
+#define SO_ATTACH_REUSEPORT_EBPF 52
+
#endif /* __ASM_GENERIC_SOCKET_H */
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
+#include <net/sock_reuseport.h>
/**
* sk_filter - run a packet through a socket filter
return 0;
}
-/**
- * sk_attach_filter - attach a socket filter
- * @fprog: the filter program
- * @sk: the socket to use
- *
- * Attach the user's filter code. We first run some sanity checks on
- * it to make sure it does not explode on us later. If an error
- * occurs or there is insufficient memory for the filter a negative
- * errno code is returned. On success the return is zero.
- */
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{
+ struct bpf_prog *old_prog;
+ int err;
+
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ return -ENOMEM;
+
+ if (sk_unhashed(sk)) {
+ err = reuseport_alloc(sk);
+ if (err)
+ return err;
+ } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* The socket wasn't bound with SO_REUSEPORT */
+ return -EINVAL;
+ }
+
+ old_prog = reuseport_attach_prog(sk, prog);
+ if (old_prog)
+ bpf_prog_destroy(old_prog);
+
+ return 0;
+}
+
+static
+struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
unsigned int fsize = bpf_classic_proglen(fprog);
unsigned int bpf_fsize = bpf_prog_size(fprog->len);
int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
- return -EPERM;
+ return ERR_PTR(-EPERM);
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
prog = bpf_prog_alloc(bpf_fsize, 0);
if (!prog)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
if (copy_from_user(prog->insns, fprog->filter, fsize)) {
__bpf_prog_free(prog);
- return -EFAULT;
+ return ERR_PTR(-EFAULT);
}
prog->len = fprog->len;
err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
__bpf_prog_free(prog);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- prog = bpf_prepare_filter(prog, NULL);
+ return bpf_prepare_filter(prog, NULL);
+}
+
+/**
+ * sk_attach_filter - attach a socket filter
+ * @fprog: the filter program
+ * @sk: the socket to use
+ *
+ * Attach the user's filter code. We first run some sanity checks on
+ * it to make sure it does not explode on us later. If an error
+ * occurs or there is insufficient memory for the filter a negative
+ * errno code is returned. On success the return is zero.
+ */
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_filter(fprog, sk);
+ int err;
+
if (IS_ERR(prog))
return PTR_ERR(prog);
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
-int sk_attach_bpf(u32 ufd, struct sock *sk)
+int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
- struct bpf_prog *prog;
+ struct bpf_prog *prog = __get_filter(fprog, sk);
int err;
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __reuseport_attach_prog(prog, sk);
+ if (err < 0) {
+ __bpf_prog_release(prog);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog;
+
if (sock_flag(sk, SOCK_FILTER_LOCKED))
- return -EPERM;
+ return ERR_PTR(-EPERM);
prog = bpf_prog_get(ufd);
if (IS_ERR(prog))
- return PTR_ERR(prog);
+ return prog;
if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
bpf_prog_put(prog);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
+ return prog;
+}
+
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_bpf(ufd, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
err = __sk_attach_prog(prog, sk);
if (err < 0) {
bpf_prog_put(prog);
return 0;
}
+int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_bpf(ufd, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __reuseport_attach_prog(prog, sk);
+ if (err < 0) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+}
+
#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1)
#define BPF_LDST_LEN 16U
#include <linux/sock_diag.h>
#include <linux/filter.h>
+#include <net/sock_reuseport.h>
#include <trace/events/sock.h>
}
break;
+ case SO_ATTACH_REUSEPORT_CBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
+ break;
+
+ ret = sk_reuseport_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_ATTACH_REUSEPORT_EBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_reuseport_attach_bpf(ufd, sk);
+ }
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
/*
* To speed up listener socket lookup, create an array to store all sockets
* listening on the same port. This allows a decision to be made after finding
- * the first socket.
+ * the first socket. An optional BPF program can also be configured for
+ * selecting the socket index from the array of available sockets.
*/
#include <net/sock_reuseport.h>
+#include <linux/bpf.h>
#include <linux/rcupdate.h>
#define INIT_SOCKS 128
reuse->max_socks = max_socks;
+ RCU_INIT_POINTER(reuse->prog, NULL);
return reuse;
}
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
+ more_reuse->prog = reuse->prog;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
+ /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
+ * that reuse and more_reuse can temporarily share a reference
+ * to prog.
+ */
kfree_rcu(reuse, rcu);
return more_reuse;
}
}
EXPORT_SYMBOL(reuseport_add_sock);
+static void reuseport_free_rcu(struct rcu_head *head)
+{
+ struct sock_reuseport *reuse;
+
+ reuse = container_of(head, struct sock_reuseport, rcu);
+ if (reuse->prog)
+ bpf_prog_destroy(reuse->prog);
+ kfree(reuse);
+}
+
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
reuse->num_socks--;
if (reuse->num_socks == 0)
- kfree_rcu(reuse, rcu);
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
break;
}
}
}
EXPORT_SYMBOL(reuseport_detach_sock);
+static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sk_buff *nskb = NULL;
+ u32 index;
+
+ if (skb_shared(skb)) {
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+ skb = nskb;
+ }
+
+ /* temporarily advance data past protocol header */
+ if (!pskb_pull(skb, hdr_len)) {
+ consume_skb(nskb);
+ return NULL;
+ }
+ index = bpf_prog_run_save_cb(prog, skb);
+ __skb_push(skb, hdr_len);
+
+ consume_skb(nskb);
+
+ if (index >= socks)
+ return NULL;
+
+ return reuse->socks[index];
+}
+
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
- * @hash: Use this hash to select.
+ * @hash: When no BPF filter is available, use this hash to select.
+ * @skb: skb to run through BPF filter.
+ * @hdr_len: BPF filter expects skb data pointer at payload data. If
+ * the skb does not yet point at the payload, this parameter represents
+ * how far the pointer needs to advance to reach the payload.
* Returns a socket that should receive the packet (or NULL on error).
*/
-struct sock *reuseport_select_sock(struct sock *sk, u32 hash)
+struct sock *reuseport_select_sock(struct sock *sk,
+ u32 hash,
+ struct sk_buff *skb,
+ int hdr_len)
{
struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
struct sock *sk2 = NULL;
u16 socks;
if (!reuse)
goto out;
+ prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
if (likely(socks)) {
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
- sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+ if (prog && skb)
+ sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+ else
+ sk2 = reuse->socks[reciprocal_scale(hash, socks)];
}
out:
return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);
+
+struct bpf_prog *
+reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ old_prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+ rcu_assign_pointer(reuse->prog, prog);
+ spin_unlock_bh(&reuseport_lock);
+
+ return old_prog;
+}
+EXPORT_SYMBOL(reuseport_attach_prog);
struct sock *sk2;
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- sk2 = reuseport_select_sock(sk, hash);
+ sk2 = reuseport_select_sock(sk, hash, NULL, 0);
if (sk2) {
result = sk2;
goto found;
*/
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport,
- int dif, struct udp_table *udptable)
+ int dif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
struct sock *sk2;
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- sk2 = reuseport_select_sock(sk, hash);
+ sk2 = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
if (sk2) {
result = sk2;
goto found;
return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- udptable);
+ udptable, skb);
}
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
- &udp_table);
+ &udp_table, NULL);
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, udptable);
+ iph->saddr, uh->source, skb->dev->ifindex, udptable,
+ NULL);
if (!sk) {
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
req->id.idiag_dst[0], req->id.idiag_dport,
- req->id.idiag_if, tbl);
+ req->id.idiag_if, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6)
sk = __udp6_lib_lookup(net,
req->id.idiag_sport,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
- req->id.idiag_if, tbl);
+ req->id.idiag_if, tbl, NULL);
#endif
else
goto out_nosk;
struct sock *sk2;
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
- sk2 = reuseport_select_sock(sk, hash);
+ sk2 = reuseport_select_sock(sk, hash, NULL, 0);
if (sk2) {
result = sk2;
goto found;
struct sock *__udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
- int dif, struct udp_table *udptable)
+ int dif, struct udp_table *udptable,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
struct sock *sk2;
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
- sk2 = reuseport_select_sock(sk, hash);
+ sk2 = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
if (sk2) {
result = sk2;
goto found;
return sk;
return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
- udptable);
+ udptable, skb);
}
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport, int dif)
{
- return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+ return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL);
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
struct net *net = dev_net(skb->dev);
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
- inet6_iif(skb), udptable);
+ inet6_iif(skb), udptable, skb);
if (!sk) {
ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);