bpf: sockmap, add hash map support

author John Fastabend <john.fastabend@gmail.com>

Mon, 14 May 2018 17:00:17 +0000 (10:00 -0700)

committer Daniel Borkmann <daniel@iogearbox.net>

Tue, 15 May 2018 18:41:03 +0000 (20:41 +0200)
author John Fastabend <john.fastabend@gmail.com>
Mon, 14 May 2018 17:00:17 +0000 (10:00 -0700)
committer Daniel Borkmann <daniel@iogearbox.net>
Tue, 15 May 2018 18:41:03 +0000 (20:41 +0200)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h

index a38e474bf7ee213a9a06b6ca8e5d01c628e09ca3..ed0122b45b633a46573cff9ad9c1ff0a7dbe7006 100644 (file)
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
  
  #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
  struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
  int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
  #else
  static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
@@ -675,6 +676,12 @@ static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
         return NULL;
  }
  
+static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
+                                                   void *key)
+{
+       return NULL;
+}
+
  static inline int sock_map_prog(struct bpf_map *map,
                                 struct bpf_prog *prog,
                                 u32 type)
@@ -724,6 +731,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
  extern const struct bpf_func_proto bpf_get_stackid_proto;
  extern const struct bpf_func_proto bpf_get_stack_proto;
  extern const struct bpf_func_proto bpf_sock_map_update_proto;
+extern const struct bpf_func_proto bpf_sock_hash_update_proto;
  
  /* Shared helpers among cBPF and eBPF. */
  void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h

index d7df1b32308269ac704ae078912dcdd3a543ad79..b67f8793de0d6083b8908f3794560363e67659fd 100644 (file)
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -47,6 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
  BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
  #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
  BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
  #endif
  BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
  #if defined(CONFIG_XDP_SOCKETS)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index 02e4112510f8e840b1f2bcd47a6acbe0d00931e9..d94d333a82259cbf07f6725445f7768b96a24419 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
         BPF_MAP_TYPE_SOCKMAP,
         BPF_MAP_TYPE_CPUMAP,
         BPF_MAP_TYPE_XSKMAP,
+       BPF_MAP_TYPE_SOCKHASH,
  };
  
  enum bpf_prog_type {
@@ -1828,7 +1829,6 @@ union bpf_attr {
   *     Return
   *             0 on success, or a negative error in case of failure.
   *
- *
   * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
   *     Description
   *             Do FIB lookup in kernel tables using parameters in *params*.
@@ -1855,6 +1855,53 @@ union bpf_attr {
   *             Egress device index on success, 0 if packet needs to continue
   *             up the stack for further processing or a negative error in case
   *             of failure.
+ *
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ *     Description
+ *             Add an entry to, or update a sockhash *map* referencing sockets.
+ *             The *skops* is used as a new value for the entry associated to
+ *             *key*. *flags* is one of:
+ *
+ *             **BPF_NOEXIST**
+ *                     The entry for *key* must not exist in the map.
+ *             **BPF_EXIST**
+ *                     The entry for *key* must already exist in the map.
+ *             **BPF_ANY**
+ *                     No condition on the existence of the entry for *key*.
+ *
+ *             If the *map* has eBPF programs (parser and verdict), those will
+ *             be inherited by the socket being added. If the socket is
+ *             already attached to eBPF programs, this results in an error.
+ *     Return
+ *             0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ *     Description
+ *             This helper is used in programs implementing policies at the
+ *             socket level. If the message *msg* is allowed to pass (i.e. if
+ *             the verdict eBPF program returns **SK_PASS**), redirect it to
+ *             the socket referenced by *map* (of type
+ *             **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *             egress interfaces can be used for redirection. The
+ *             **BPF_F_INGRESS** value in *flags* is used to make the
+ *             distinction (ingress path is selected if the flag is present,
+ *             egress path otherwise). This is the only flag supported for now.
+ *     Return
+ *             **SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ *     Description
+ *             This helper is used in programs implementing policies at the
+ *             skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *             if the verdeict eBPF program returns **SK_PASS**), redirect it
+ *             to the socket referenced by *map* (of type
+ *             **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *             egress interfaces can be used for redirection. The
+ *             **BPF_F_INGRESS** value in *flags* is used to make the
+ *             distinction (ingress path is selected if the flag is present,
+ *             egress otherwise). This is the only flag supported for now.
+ *     Return
+ *             **SK_PASS** on success, or **SK_DROP** on error.
   */
  #define __BPF_FUNC_MAPPER(FN)          \
         FN(unspec),                     \
@@ -1926,7 +1973,10 @@ union bpf_attr {
         FN(skb_get_xfrm_state),         \
         FN(get_stack),                  \
         FN(skb_load_bytes_relative),    \
-       FN(fib_lookup),
+       FN(fib_lookup),                 \
+       FN(sock_hash_update),           \
+       FN(msg_redirect_hash),          \
+       FN(sk_redirect_hash),
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
   * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c

index d0d7d94623684b8698a67bba72b05c6de36ed35d..2194c6a9df42bd38ac631328bba9704c0b2d6999 100644 (file)
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1707,6 +1707,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
  const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
  const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
  
  const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
  {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c

index beab9ec9b023bd9a64eff2fbeeef31cdafbf9ce6..56879c9fd3a462c1380586cc6b42c9d0faac32be 100644 (file)
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -60,6 +60,28 @@ struct bpf_stab {
         struct bpf_sock_progs progs;
  };
  
+struct bucket {
+       struct hlist_head head;
+       raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+       struct bpf_map map;
+       struct bucket *buckets;
+       atomic_t count;
+       u32 n_buckets;
+       u32 elem_size;
+       struct bpf_sock_progs progs;
+};
+
+struct htab_elem {
+       struct rcu_head rcu;
+       struct hlist_node hash_node;
+       u32 hash;
+       struct sock *sk;
+       char key[0];
+};
+
  enum smap_psock_state {
         SMAP_TX_RUNNING,
  };
@@ -67,6 +89,8 @@ enum smap_psock_state {
  struct smap_psock_map_entry {
         struct list_head list;
         struct sock **entry;
+       struct htab_elem *hash_link;
+       struct bpf_htab *htab;
  };
  
  struct smap_psock {
@@ -195,6 +219,12 @@ out:
         rcu_read_unlock();
  }
  
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+       atomic_dec(&htab->count);
+       kfree_rcu(l, rcu);
+}
+
  static void bpf_tcp_close(struct sock *sk, long timeout)
  {
         void (*close_fun)(struct sock *sk, long timeout);
@@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
         }
  
         list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-               osk = cmpxchg(e->entry, sk, NULL);
-               if (osk == sk) {
-                       list_del(&e->list);
-                       smap_release_sock(psock, sk);
+               if (e->entry) {
+                       osk = cmpxchg(e->entry, sk, NULL);
+                       if (osk == sk) {
+                               list_del(&e->list);
+                               smap_release_sock(psock, sk);
+                       }
+               } else {
+                       hlist_del_rcu(&e->hash_link->hash_node);
+                       smap_release_sock(psock, e->hash_link->sk);
+                       free_htab_elem(e->htab, e->hash_link);
                 }
         }
         write_unlock_bh(&sk->sk_callback_lock);
@@ -1527,12 +1563,14 @@ free_stab:
         return ERR_PTR(err);
  }
  
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+                            struct sock **entry,
+                            struct htab_elem *hash_link)
  {
         struct smap_psock_map_entry *e, *tmp;
  
         list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-               if (e->entry == entry) {
+               if (e->entry == entry || e->hash_link == hash_link) {
                         list_del(&e->list);
                         break;
                 }
@@ -1570,7 +1608,7 @@ static void sock_map_free(struct bpf_map *map)
                  * to be null and queued for garbage collection.
                  */
                 if (likely(psock)) {
-                       smap_list_remove(psock, &stab->sock_map[i]);
+                       smap_list_remove(psock, &stab->sock_map[i], NULL);
                         smap_release_sock(psock, sock);
                 }
                 write_unlock_bh(&sock->sk_callback_lock);
@@ -1629,7 +1667,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
  
         if (psock->bpf_parse)
                 smap_stop_sock(psock, sock);
-       smap_list_remove(psock, &stab->sock_map[k]);
+       smap_list_remove(psock, &stab->sock_map[k], NULL);
         smap_release_sock(psock, sock);
  out:
         write_unlock_bh(&sock->sk_callback_lock);
@@ -1746,10 +1784,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
                 new = true;
         }
  
-       e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-       if (!e) {
-               err = -ENOMEM;
-               goto out_progs;
+       if (map_link) {
+               e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+               if (!e) {
+                       err = -ENOMEM;
+                       goto out_progs;
+               }
         }
  
         /* 3. At this point we have a reference to a valid psock that is
@@ -1783,6 +1823,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
         write_unlock_bh(&sock->sk_callback_lock);
         return err;
  out_free:
+       kfree(e);
         smap_release_sock(psock, sock);
  out_progs:
         if (verdict)
@@ -1829,7 +1870,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
                 struct smap_psock *opsock = smap_psock_sk(osock);
  
                 write_lock_bh(&osock->sk_callback_lock);
-               smap_list_remove(opsock, &stab->sock_map[i]);
+               smap_list_remove(opsock, &stab->sock_map[i], NULL);
                 smap_release_sock(opsock, osock);
                 write_unlock_bh(&osock->sk_callback_lock);
         }
@@ -1846,6 +1887,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
                 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
  
                 progs = &stab->progs;
+       } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+               struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+               progs = &htab->progs;
         } else {
                 return -EINVAL;
         }
@@ -1906,11 +1951,19 @@ static int sock_map_update_elem(struct bpf_map *map,
  
  static void sock_map_release(struct bpf_map *map)
  {
-       struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
         struct bpf_sock_progs *progs;
         struct bpf_prog *orig;
  
-       progs = &stab->progs;
+       if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+               struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+               progs = &stab->progs;
+       } else {
+               struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+               progs = &htab->progs;
+       }
+
         orig = xchg(&progs->bpf_parse, NULL);
         if (orig)
                 bpf_prog_put(orig);
@@ -1923,6 +1976,390 @@ static void sock_map_release(struct bpf_map *map)
                 bpf_prog_put(orig);
  }
  
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+       struct bpf_htab *htab;
+       int i, err;
+       u64 cost;
+
+       if (!capable(CAP_NET_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       /* check sanity of attributes */
+       if (attr->max_entries == 0 || attr->value_size != 4 ||
+           attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+               return ERR_PTR(-EINVAL);
+
+       err = bpf_tcp_ulp_register();
+       if (err && err != -EEXIST)
+               return ERR_PTR(err);
+
+       htab = kzalloc(sizeof(*htab), GFP_USER);
+       if (!htab)
+               return ERR_PTR(-ENOMEM);
+
+       bpf_map_init_from_attr(&htab->map, attr);
+
+       htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+       htab->elem_size = sizeof(struct htab_elem) +
+                         round_up(htab->map.key_size, 8);
+       err = -EINVAL;
+       if (htab->n_buckets == 0 ||
+           htab->n_buckets > U32_MAX / sizeof(struct bucket))
+               goto free_htab;
+
+       cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+              (u64) htab->elem_size * htab->map.max_entries;
+
+       if (cost >= U32_MAX - PAGE_SIZE)
+               goto free_htab;
+
+       htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+       err = bpf_map_precharge_memlock(htab->map.pages);
+       if (err)
+               goto free_htab;
+
+       err = -ENOMEM;
+       htab->buckets = bpf_map_area_alloc(
+                               htab->n_buckets * sizeof(struct bucket),
+                               htab->map.numa_node);
+       if (!htab->buckets)
+               goto free_htab;
+
+       for (i = 0; i < htab->n_buckets; i++) {
+               INIT_HLIST_HEAD(&htab->buckets[i].head);
+               raw_spin_lock_init(&htab->buckets[i].lock);
+       }
+
+       return &htab->map;
+free_htab:
+       kfree(htab);
+       return ERR_PTR(err);
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+       return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+       return &__select_bucket(htab, hash)->head;
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       int i;
+
+       synchronize_rcu();
+
+       /* At this point no update, lookup or delete operations can happen.
+        * However, be aware we can still get a socket state event updates,
+        * and data ready callabacks that reference the psock from sk_user_data
+        * Also psock worker threads are still in-flight. So smap_release_sock
+        * will only free the psock after cancel_sync on the worker threads
+        * and a grace period expire to ensure psock is really safe to remove.
+        */
+       rcu_read_lock();
+       for (i = 0; i < htab->n_buckets; i++) {
+               struct hlist_head *head = select_bucket(htab, i);
+               struct hlist_node *n;
+               struct htab_elem *l;
+
+               hlist_for_each_entry_safe(l, n, head, hash_node) {
+                       struct sock *sock = l->sk;
+                       struct smap_psock *psock;
+
+                       hlist_del_rcu(&l->hash_node);
+                       write_lock_bh(&sock->sk_callback_lock);
+                       psock = smap_psock_sk(sock);
+                       /* This check handles a racing sock event that can get
+                        * the sk_callback_lock before this case but after xchg
+                        * causing the refcnt to hit zero and sock user data
+                        * (psock) to be null and queued for garbage collection.
+                        */
+                       if (likely(psock)) {
+                               smap_list_remove(psock, NULL, l);
+                               smap_release_sock(psock, sock);
+                       }
+                       write_unlock_bh(&sock->sk_callback_lock);
+                       kfree(l);
+               }
+       }
+       rcu_read_unlock();
+       bpf_map_area_free(htab->buckets);
+       kfree(htab);
+}
+
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+                                             void *key, u32 key_size, u32 hash,
+                                             struct sock *sk,
+                                             struct htab_elem *old_elem)
+{
+       struct htab_elem *l_new;
+
+       if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+               if (!old_elem) {
+                       atomic_dec(&htab->count);
+                       return ERR_PTR(-E2BIG);
+               }
+       }
+       l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+                            htab->map.numa_node);
+       if (!l_new)
+               return ERR_PTR(-ENOMEM);
+
+       memcpy(l_new->key, key, key_size);
+       l_new->sk = sk;
+       l_new->hash = hash;
+       return l_new;
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+                                        u32 hash, void *key, u32 key_size)
+{
+       struct htab_elem *l;
+
+       hlist_for_each_entry_rcu(l, head, hash_node) {
+               if (l->hash == hash && !memcmp(&l->key, key, key_size))
+                       return l;
+       }
+
+       return NULL;
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+       return jhash(key, key_len, 0);
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map,
+                                 void *key, void *next_key)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct htab_elem *l, *next_l;
+       struct hlist_head *h;
+       u32 hash, key_size;
+       int i = 0;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       key_size = map->key_size;
+       if (!key)
+               goto find_first_elem;
+       hash = htab_map_hash(key, key_size);
+       h = select_bucket(htab, hash);
+
+       l = lookup_elem_raw(h, hash, key, key_size);
+       if (!l)
+               goto find_first_elem;
+       next_l = hlist_entry_safe(
+                    rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+                    struct htab_elem, hash_node);
+       if (next_l) {
+               memcpy(next_key, next_l->key, key_size);
+               return 0;
+       }
+
+       /* no more elements in this hash list, go to the next bucket */
+       i = hash & (htab->n_buckets - 1);
+       i++;
+
+find_first_elem:
+       /* iterate over buckets */
+       for (; i < htab->n_buckets; i++) {
+               h = select_bucket(htab, i);
+
+               /* pick first element in the bucket */
+               next_l = hlist_entry_safe(
+                               rcu_dereference_raw(hlist_first_rcu(h)),
+                               struct htab_elem, hash_node);
+               if (next_l) {
+                       /* if it's not empty, just return it */
+                       memcpy(next_key, next_l->key, key_size);
+                       return 0;
+               }
+       }
+
+       /* iterated over all buckets and all elements */
+       return -ENOENT;
+}
+
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+                                    struct bpf_map *map,
+                                    void *key, u64 map_flags)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct bpf_sock_progs *progs = &htab->progs;
+       struct htab_elem *l_new = NULL, *l_old;
+       struct smap_psock_map_entry *e = NULL;
+       struct hlist_head *head;
+       struct smap_psock *psock;
+       u32 key_size, hash;
+       struct sock *sock;
+       struct bucket *b;
+       int err;
+
+       sock = skops->sk;
+
+       if (sock->sk_type != SOCK_STREAM ||
+           sock->sk_protocol != IPPROTO_TCP)
+               return -EOPNOTSUPP;
+
+       if (unlikely(map_flags > BPF_EXIST))
+               return -EINVAL;
+
+       e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+       if (!e)
+               return -ENOMEM;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       key_size = map->key_size;
+       hash = htab_map_hash(key, key_size);
+       b = __select_bucket(htab, hash);
+       head = &b->head;
+
+       err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+       if (err)
+               goto err;
+
+       /* bpf_map_update_elem() can be called in_irq() */
+       raw_spin_lock_bh(&b->lock);
+       l_old = lookup_elem_raw(head, hash, key, key_size);
+       if (l_old && map_flags == BPF_NOEXIST) {
+               err = -EEXIST;
+               goto bucket_err;
+       }
+       if (!l_old && map_flags == BPF_EXIST) {
+               err = -ENOENT;
+               goto bucket_err;
+       }
+
+       l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+       if (IS_ERR(l_new)) {
+               err = PTR_ERR(l_new);
+               goto bucket_err;
+       }
+
+       psock = smap_psock_sk(sock);
+       if (unlikely(!psock)) {
+               err = -EINVAL;
+               goto bucket_err;
+       }
+
+       e->hash_link = l_new;
+       e->htab = container_of(map, struct bpf_htab, map);
+       list_add_tail(&e->list, &psock->maps);
+
+       /* add new element to the head of the list, so that
+        * concurrent search will find it before old elem
+        */
+       hlist_add_head_rcu(&l_new->hash_node, head);
+       if (l_old) {
+               psock = smap_psock_sk(l_old->sk);
+
+               hlist_del_rcu(&l_old->hash_node);
+               smap_list_remove(psock, NULL, l_old);
+               smap_release_sock(psock, l_old->sk);
+               free_htab_elem(htab, l_old);
+       }
+       raw_spin_unlock_bh(&b->lock);
+       return 0;
+bucket_err:
+       raw_spin_unlock_bh(&b->lock);
+err:
+       kfree(e);
+       psock = smap_psock_sk(sock);
+       if (psock)
+               smap_release_sock(psock, sock);
+       return err;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map,
+                               void *key, void *value, u64 flags)
+{
+       struct bpf_sock_ops_kern skops;
+       u32 fd = *(u32 *)value;
+       struct socket *socket;
+       int err;
+
+       socket = sockfd_lookup(fd, &err);
+       if (!socket)
+               return err;
+
+       skops.sk = socket->sk;
+       if (!skops.sk) {
+               fput(socket->file);
+               return -EINVAL;
+       }
+
+       err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+       fput(socket->file);
+       return err;
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct hlist_head *head;
+       struct bucket *b;
+       struct htab_elem *l;
+       u32 hash, key_size;
+       int ret = -ENOENT;
+
+       key_size = map->key_size;
+       hash = htab_map_hash(key, key_size);
+       b = __select_bucket(htab, hash);
+       head = &b->head;
+
+       raw_spin_lock_bh(&b->lock);
+       l = lookup_elem_raw(head, hash, key, key_size);
+       if (l) {
+               struct sock *sock = l->sk;
+               struct smap_psock *psock;
+
+               hlist_del_rcu(&l->hash_node);
+               write_lock_bh(&sock->sk_callback_lock);
+               psock = smap_psock_sk(sock);
+               /* This check handles a racing sock event that can get the
+                * sk_callback_lock before this case but after xchg happens
+                * causing the refcnt to hit zero and sock user data (psock)
+                * to be null and queued for garbage collection.
+                */
+               if (likely(psock)) {
+                       smap_list_remove(psock, NULL, l);
+                       smap_release_sock(psock, sock);
+               }
+               write_unlock_bh(&sock->sk_callback_lock);
+               free_htab_elem(htab, l);
+               ret = 0;
+       }
+       raw_spin_unlock_bh(&b->lock);
+       return ret;
+}
+
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+       struct hlist_head *head;
+       struct htab_elem *l;
+       u32 key_size, hash;
+       struct bucket *b;
+       struct sock *sk;
+
+       key_size = map->key_size;
+       hash = htab_map_hash(key, key_size);
+       b = __select_bucket(htab, hash);
+       head = &b->head;
+
+       raw_spin_lock_bh(&b->lock);
+       l = lookup_elem_raw(head, hash, key, key_size);
+       sk = l ? l->sk : NULL;
+       raw_spin_unlock_bh(&b->lock);
+       return sk;
+}
+
  const struct bpf_map_ops sock_map_ops = {
         .map_alloc = sock_map_alloc,
         .map_free = sock_map_free,
@@ -1933,6 +2370,15 @@ const struct bpf_map_ops sock_map_ops = {
         .map_release_uref = sock_map_release,
  };
  
+const struct bpf_map_ops sock_hash_ops = {
+       .map_alloc = sock_hash_alloc,
+       .map_free = sock_hash_free,
+       .map_lookup_elem = sock_map_lookup,
+       .map_get_next_key = sock_hash_get_next_key,
+       .map_update_elem = sock_hash_update_elem,
+       .map_delete_elem = sock_hash_delete_elem,
+};
+
  BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
            struct bpf_map *, map, void *, key, u64, flags)
  {
@@ -1950,3 +2396,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
         .arg3_type      = ARG_PTR_TO_MAP_KEY,
         .arg4_type      = ARG_ANYTHING,
  };
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+       .func           = bpf_sock_hash_update,
+       .gpl_only       = false,
+       .pkt_access     = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

index d92d9c37affd70d3e4b16d97d3dcdc00de818c62..a9e4b1372da6c1635e708e1ace953a19e4030030 100644 (file)
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2093,6 +2093,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                     func_id != BPF_FUNC_msg_redirect_map)
                         goto error;
                 break;
+       case BPF_MAP_TYPE_SOCKHASH:
+               if (func_id != BPF_FUNC_sk_redirect_hash &&
+                   func_id != BPF_FUNC_sock_hash_update &&
+                   func_id != BPF_FUNC_map_delete_elem &&
+                   func_id != BPF_FUNC_msg_redirect_hash)
+                       goto error;
+               break;
         default:
                 break;
         }
@@ -2130,11 +2137,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                 break;
         case BPF_FUNC_sk_redirect_map:
         case BPF_FUNC_msg_redirect_map:
+       case BPF_FUNC_sock_map_update:
                 if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
                         goto error;
                 break;
-       case BPF_FUNC_sock_map_update:
-               if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+       case BPF_FUNC_sk_redirect_hash:
+       case BPF_FUNC_msg_redirect_hash:
+       case BPF_FUNC_sock_hash_update:
+               if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
                         goto error;
                 break;
         default:
diff --git a/net/core/filter.c b/net/core/filter.c

index 61a3ed6bac2598f23f133a2ab699dfe52dbdfade..6d0d1560bd70e88c3ebaaf636e4a27e2f3f395d5 100644 (file)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2074,6 +2074,33 @@ static const struct bpf_func_proto bpf_redirect_proto = {
         .arg2_type      = ARG_ANYTHING,
  };
  
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+       /* If user passes invalid input drop the packet. */
+       if (unlikely(flags & ~(BPF_F_INGRESS)))
+               return SK_DROP;
+
+       tcb->bpf.flags = flags;
+       tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+       if (!tcb->bpf.sk_redir)
+               return SK_DROP;
+
+       return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+       .func           = bpf_sk_redirect_hash,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
+
  BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
            struct bpf_map *, map, u32, key, u64, flags)
  {
@@ -2108,6 +2135,31 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
         .arg4_type      = ARG_ANYTHING,
  };
  
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+          struct bpf_map *, map, void *, key, u64, flags)
+{
+       /* If user passes invalid input drop the packet. */
+       if (unlikely(flags & ~(BPF_F_INGRESS)))
+               return SK_DROP;
+
+       msg->flags = flags;
+       msg->sk_redir = __sock_hash_lookup_elem(map, key);
+       if (!msg->sk_redir)
+               return SK_DROP;
+
+       return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+       .func           = bpf_msg_redirect_hash,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_PTR_TO_MAP_KEY,
+       .arg4_type      = ARG_ANYTHING,
+};
+
  BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
            struct bpf_map *, map, u32, key, u64, flags)
  {
@@ -4502,6 +4554,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                 return &bpf_sock_ops_cb_flags_set_proto;
         case BPF_FUNC_sock_map_update:
                 return &bpf_sock_map_update_proto;
+       case BPF_FUNC_sock_hash_update:
+               return &bpf_sock_hash_update_proto;
         default:
                 return bpf_base_func_proto(func_id);
         }
@@ -4513,6 +4567,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         switch (func_id) {
         case BPF_FUNC_msg_redirect_map:
                 return &bpf_msg_redirect_map_proto;
+       case BPF_FUNC_msg_redirect_hash:
+               return &bpf_msg_redirect_hash_proto;
         case BPF_FUNC_msg_apply_bytes:
                 return &bpf_msg_apply_bytes_proto;
         case BPF_FUNC_msg_cork_bytes:
@@ -4544,6 +4600,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                 return &bpf_get_socket_uid_proto;
         case BPF_FUNC_sk_redirect_map:
                 return &bpf_sk_redirect_map_proto;
+       case BPF_FUNC_sk_redirect_hash:
+               return &bpf_sk_redirect_hash_proto;
         default:
                 return bpf_base_func_proto(func_id);
         }
author	John Fastabend <john.fastabend@gmail.com>
	Mon, 14 May 2018 17:00:17 +0000 (10:00 -0700)
committer	Daniel Borkmann <daniel@iogearbox.net>
	Tue, 15 May 2018 18:41:03 +0000 (20:41 +0200)
include/linux/bpf.h		patch \| blob \| history
include/linux/bpf_types.h		patch \| blob \| history
include/uapi/linux/bpf.h		patch \| blob \| history
kernel/bpf/core.c		patch \| blob \| history
kernel/bpf/sockmap.c		patch \| blob \| history
kernel/bpf/verifier.c		patch \| blob \| history
net/core/filter.c		patch \| blob \| history