bpf: introduce bpf_spin_lock
author    Alexei Starovoitov <ast@kernel.org>
          Thu, 31 Jan 2019 23:40:04 +0000 (15:40 -0800)
committer Daniel Borkmann <daniel@iogearbox.net>
          Fri, 1 Feb 2019 19:55:38 +0000 (20:55 +0100)
Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let
bpf programs serialize access to other variables.

Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem *val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}
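
A complementary sketch of how such a map could be declared on the bpf program
side so that the loader attaches BTF for the value type (purely illustrative;
it assumes the classic 'bpf_map_def' style and the BPF_ANNOTATE_KV_PAIR macro
from selftests' bpf_helpers.h, neither of which is part of this patch):

struct bpf_map_def SEC("maps") hash_map = {
    .type = BPF_MAP_TYPE_HASH,
    .key_size = sizeof(int),
    .value_size = sizeof(struct hash_elem),
    .max_entries = 100,
};
/* emits BTF tying the 'int' key and 'struct hash_elem' value to hash_map,
 * so the verifier can locate 'struct bpf_spin_lock' inside the value
 */
BPF_ANNOTATE_KV_PAIR(hash_map, int, struct hash_elem);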

Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- bpf program can take only one bpf_spin_lock at a time, since two or more
  can cause deadlocks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  It drastically simplifies the implementation yet still allows bpf programs
  to use any number of bpf_spin_locks across different map elements.
- while bpf_spin_lock is held, calls (either bpf2bpf or helper calls) are
  not allowed.
- a bpf program must bpf_spin_unlock() before returning.
- bpf program can access 'struct bpf_spin_lock' only via
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- load/store into 'struct bpf_spin_lock lock;' field is not allowed.
- to use the bpf_spin_lock() helper the BTF description of the map value must
  be a struct with a 'struct bpf_spin_lock anyname;' field at the top level.
  A lock nested inside another struct is not allowed (see the layout sketch
  after this list).
- syscall map_lookup doesn't copy bpf_spin_lock field to user space.
- syscall map_update and program map_update do not update bpf_spin_lock field.
- bpf_spin_lock cannot be on the stack or inside networking packet.
  bpf_spin_lock can only be inside HASH or ARRAY map value.
- bpf_spin_lock is available to root only, for all program types.
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside spin_lock-ed region.
- tracing progs and socket filter progs cannot use bpf_spin_lock due to
  insufficient preemption checks.
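
To illustrate the BTF layout rules above, a sketch of a value type the
verifier accepts next to ones it rejects (not taken from this patch):

struct elem_ok {               /* ok: lock at the top level of the map value */
    struct bpf_spin_lock lock;
    int cnt;
};

struct inner {
    struct bpf_spin_lock lock;
};
struct elem_bad {              /* rejected: lock nested in another struct */
    struct inner i;
    int cnt;
};

/* also rejected: touching the lock field with plain loads/stores */
val->lock.val = 0;             /* verifier error, use the helpers instead */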

Implementation details:
- the cgroup-bpf class of programs can nest with xdp/tc programs, hence
  bpf_spin_lock is equivalent to spin_lock_irqsave.
  Other solutions to avoid nested bpf_spin_lock are possible, like making
  sure that all networking progs run with softirq disabled, but
  spin_lock_irqsave is the simplest and doesn't add overhead to programs
  that don't use it.
- arch_spinlock_t is used when it's implemented as a queued_spin_lock
- archs can force their own arch_spinlock_t
- on architectures where queued_spin_lock is not available and
  sizeof(arch_spinlock_t) != sizeof(__u32), a trivial lock is used.
- presence of bpf_spin_lock inside the map value could have been indicated via
  an extra flag during map_create, but specifying it via BTF is cleaner.
  It provides introspection for map key/value and reduces user mistakes
  (see the user-space sketch below).
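
As an illustration of the last point, a user-space sketch of creating such a
map with BTF attached (assuming libbpf's bpf_create_map_xattr(); btf_fd,
key_id and value_id are assumed to come from loading the object's BTF
beforehand):

struct bpf_create_map_attr attr = {
    .name = "hash_map",
    .map_type = BPF_MAP_TYPE_HASH,
    .key_size = sizeof(int),
    .value_size = sizeof(struct hash_elem),
    .max_entries = 100,
    .btf_fd = btf_fd,
    .btf_key_type_id = key_id,
    .btf_value_type_id = value_id,
};
int map_fd = bpf_create_map_xattr(&attr);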

Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce a BPF_F_LOCK flag for the bpf_map_update() syscall and helper
  to request the kernel to grab bpf_spin_lock before rewriting the value.
  That will serialize access to map elements (a sketch of the intended
  usage follows).
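
If the BPF_F_LOCK flag materializes, usage from user space could look roughly
like this (purely illustrative, not part of this patch):

struct hash_elem val = { .cnt = 42 };

/* ask the kernel to take the element's bpf_spin_lock around the copy */
bpf_map_update_elem(map_fd, &key, &val, BPF_F_LOCK);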

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
14 files changed:
include/linux/bpf.h
include/linux/bpf_verifier.h
include/linux/btf.h
include/uapi/linux/bpf.h
kernel/Kconfig.locks
kernel/bpf/arraymap.c
kernel/bpf/btf.c
kernel/bpf/core.c
kernel/bpf/hashtab.c
kernel/bpf/helpers.c
kernel/bpf/map_in_map.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
net/core/filter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0394f1f9213bf2542eb1de2190420e5047da51c7..2ae615b48bb81a3b4fdfc2ca6eb6dbdaf97a0dd7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -72,14 +72,15 @@ struct bpf_map {
        u32 value_size;
        u32 max_entries;
        u32 map_flags;
-       u32 pages;
+       int spin_lock_off; /* >=0 valid offset, <0 error */
        u32 id;
        int numa_node;
        u32 btf_key_type_id;
        u32 btf_value_type_id;
        struct btf *btf;
+       u32 pages;
        bool unpriv_array;
-       /* 55 bytes hole */
+       /* 51 bytes hole */
 
        /* The 3rd and 4th cacheline with misc members to avoid false sharing
         * particularly with refcounting.
@@ -91,6 +92,34 @@ struct bpf_map {
        char name[BPF_OBJ_NAME_LEN];
 };
 
+static inline bool map_value_has_spin_lock(const struct bpf_map *map)
+{
+       return map->spin_lock_off >= 0;
+}
+
+static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+{
+       if (likely(!map_value_has_spin_lock(map)))
+               return;
+       *(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+               (struct bpf_spin_lock){};
+}
+
+/* copy everything but bpf_spin_lock */
+static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
+{
+       if (unlikely(map_value_has_spin_lock(map))) {
+               u32 off = map->spin_lock_off;
+
+               memcpy(dst, src, off);
+               memcpy(dst + off + sizeof(struct bpf_spin_lock),
+                      src + off + sizeof(struct bpf_spin_lock),
+                      map->value_size - off - sizeof(struct bpf_spin_lock));
+       } else {
+               memcpy(dst, src, map->value_size);
+       }
+}
+
 struct bpf_offload_dev;
 struct bpf_offloaded_map;
 
@@ -162,6 +191,7 @@ enum bpf_arg_type {
        ARG_PTR_TO_CTX,         /* pointer to context */
        ARG_ANYTHING,           /* any (initialized) argument is ok */
        ARG_PTR_TO_SOCKET,      /* pointer to bpf_sock */
+       ARG_PTR_TO_SPIN_LOCK,   /* pointer to bpf_spin_lock */
 };
 
 /* type of values returned from helper functions */
@@ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
-
+extern const struct bpf_func_proto bpf_spin_lock_proto;
+extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0620e418dde5d48161d659c291839a0e93f93621..69f7a3449eda83a8a25fd1f5ac3dedc36108deba 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
        /* call stack tracking */
        struct bpf_func_state *frame[MAX_CALL_FRAMES];
        u32 curframe;
+       u32 active_spin_lock;
        bool speculative;
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 12502e25e76764c183d107b5bf00c76a02f3e6af..455d31b55828d2240f3e0f27920be11b75dfee57 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
                           const struct btf_member *m,
                           u32 expected_offset, u32 expected_size);
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60b99b730a41857a0bac432af81b294f1df7877e..86f7c438d40f725107521c1fac8b0f038954d695 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2422,7 +2422,9 @@ union bpf_attr {
        FN(map_peek_elem),              \
        FN(msg_push_data),              \
        FN(msg_pop_data),               \
-       FN(rc_pointer_rel),
+       FN(rc_pointer_rel),             \
+       FN(spin_lock),                  \
+       FN(spin_unlock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3056,4 +3058,7 @@ struct bpf_line_info {
        __u32   line_col;
 };
 
+struct bpf_spin_lock {
+       __u32   val;
+};
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 84d882f3e2993b9ce3fdef1cc6aff5c52a8f4622..fbba478ae52294f5306818deb15a9259b0132d53 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
        def_bool y if ARCH_USE_QUEUED_SPINLOCKS
        depends on SMP
 
+config BPF_ARCH_SPINLOCK
+       bool
+
 config ARCH_USE_QUEUED_RWLOCKS
        bool
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630781e0753eb2545bb75d98afb6ccb..d6d979910a2a8516fb0b0c53123d341fa526f971 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -270,9 +270,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
                memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
                       value, map->value_size);
        else
-               memcpy(array->value +
-                      array->elem_size * (index & array->index_mask),
-                      value, map->value_size);
+               copy_map_value(map,
+                              array->value +
+                              array->elem_size * (index & array->index_mask),
+                              value);
        return 0;
 }
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3d661f0606fedfceb9cf5cd4ee03e389dae4edca..7019c1f05cabea7e49571d61bc93b7330680bba2 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
        return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
 }
 
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+       return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
 static bool btf_type_is_array(const struct btf_type *t)
 {
        return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -2045,6 +2050,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
        btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+       const struct btf_member *member;
+       u32 i, off = -ENOENT;
+
+       if (!__btf_type_is_struct(t))
+               return -EINVAL;
+
+       for_each_member(i, t, member) {
+               const struct btf_type *member_type = btf_type_by_id(btf,
+                                                                   member->type);
+               if (!__btf_type_is_struct(member_type))
+                       continue;
+               if (member_type->size != sizeof(struct bpf_spin_lock))
+                       continue;
+               if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+                          "bpf_spin_lock"))
+                       continue;
+               if (off != -ENOENT)
+                       /* only one 'struct bpf_spin_lock' is allowed */
+                       return -E2BIG;
+               off = btf_member_bit_offset(t, member);
+               if (off % 8)
+                       /* valid C code cannot generate such BTF */
+                       return -EINVAL;
+               off /= 8;
+               if (off % __alignof__(struct bpf_spin_lock))
+                       /* valid struct bpf_spin_lock will be 4 byte aligned */
+                       return -EINVAL;
+       }
+       return off;
+}
+
 static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
                                u32 type_id, void *data, u8 bits_offset,
                                struct seq_file *m)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f13c543b7b3673013ef5ee126f2fc03fcbfac01b..ef88b167959d64fe2bb8da87a3300e65825e21f2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2002,6 +2002,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_map_push_elem_proto __weak;
 const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
 const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d6998cdbb273ced0d30770d3f6b1e..6d3b22c5ad689eb9006882a70e7f6e8b4841bc5b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
               BITS_PER_LONG == 64;
 }
 
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
-       u32 size = htab->map.value_size;
-
-       if (percpu || fd_htab_map_needs_adjust(htab))
-               size = round_up(size, 8);
-       return size;
-}
-
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
                                         void *value, u32 key_size, u32 hash,
                                         bool percpu, bool onallcpus,
                                         struct htab_elem *old_elem)
 {
-       u32 size = htab_size_value(htab, percpu);
+       u32 size = htab->map.value_size;
        bool prealloc = htab_is_prealloc(htab);
        struct htab_elem *l_new, **pl_new;
        void __percpu *pptr;
@@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
                        l_new = ERR_PTR(-ENOMEM);
                        goto dec_count;
                }
+               check_and_init_map_lock(&htab->map,
+                                       l_new->key + round_up(key_size, 8));
        }
 
        memcpy(l_new->key, key, key_size);
        if (percpu) {
+               size = round_up(size, 8);
                if (prealloc) {
                        pptr = htab_elem_get_ptr(l_new, key_size);
                } else {
@@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 
                if (!prealloc)
                        htab_elem_set_ptr(l_new, key_size, pptr);
-       } else {
+       } else if (fd_htab_map_needs_adjust(htab)) {
+               size = round_up(size, 8);
                memcpy(l_new->key + round_up(key_size, 8), value, size);
+       } else {
+               copy_map_value(&htab->map,
+                              l_new->key + round_up(key_size, 8),
+                              value);
        }
 
        l_new->hash = hash;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a74972b07e7467deb018853b554027faf20c26d8..fbe544761628c412703f79150bf95e7cae4f75ed 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -221,6 +221,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
        .arg2_type      = ARG_CONST_SIZE,
 };
 
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+       arch_spinlock_t *l = (void *)lock;
+       union {
+               __u32 val;
+               arch_spinlock_t lock;
+       } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+       compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+       BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+       BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+       arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+       arch_spinlock_t *l = (void *)lock;
+
+       arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+       atomic_t *l = (void *)lock;
+
+       BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+       do {
+               atomic_cond_read_relaxed(l, !VAL);
+       } while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+       atomic_t *l = (void *)lock;
+
+       atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __bpf_spin_lock(lock);
+       __this_cpu_write(irqsave_flags, flags);
+       return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+       .func           = bpf_spin_lock,
+       .gpl_only       = false,
+       .ret_type       = RET_VOID,
+       .arg1_type      = ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+       unsigned long flags;
+
+       flags = __this_cpu_read(irqsave_flags);
+       __bpf_spin_unlock(lock);
+       local_irq_restore(flags);
+       return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+       .func           = bpf_spin_unlock,
+       .gpl_only       = false,
+       .ret_type       = RET_VOID,
+       .arg1_type      = ARG_PTR_TO_SPIN_LOCK,
+};
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 52378d3e34b3296d5ff07b1418b9b9307da53bf5..583346a0ab299773ce8ab71d0f920a9cd36a20cf 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
                return ERR_PTR(-EINVAL);
        }
 
+       if (map_value_has_spin_lock(inner_map)) {
+               fdput(f);
+               return ERR_PTR(-ENOTSUPP);
+       }
+
        inner_map_meta_size = sizeof(*inner_map_meta);
        /* In some cases verifier needs to access beyond just base map. */
        if (inner_map->ops == &array_map_ops)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd77d6c40e215a09ef2af37bd2a077..ebf0a673cb833bb37339f12af39140c9f207f595 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map,
        return -ENOTSUPP;
 }
 
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
                         u32 btf_key_id, u32 btf_value_id)
 {
        const struct btf_type *key_type, *value_type;
@@ -478,6 +478,21 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
        if (!value_type || value_size != map->value_size)
                return -EINVAL;
 
+       map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+       if (map_value_has_spin_lock(map)) {
+               if (map->map_type != BPF_MAP_TYPE_HASH &&
+                   map->map_type != BPF_MAP_TYPE_ARRAY)
+                       return -ENOTSUPP;
+               if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+                   map->value_size) {
+                       WARN_ONCE(1,
+                                 "verifier bug spin_lock_off %d value_size %d\n",
+                                 map->spin_lock_off, map->value_size);
+                       return -EFAULT;
+               }
+       }
+
        if (map->ops->map_check_btf)
                ret = map->ops->map_check_btf(map, btf, key_type, value_type);
 
@@ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr)
                map->btf = btf;
                map->btf_key_type_id = attr->btf_key_type_id;
                map->btf_value_type_id = attr->btf_value_type_id;
+       } else {
+               map->spin_lock_off = -EINVAL;
        }
 
        err = security_bpf_map_alloc(map);
@@ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr)
                        err = -ENOENT;
                } else {
                        err = 0;
-                       memcpy(value, ptr, value_size);
+                       copy_map_value(map, value, ptr);
                }
                rcu_read_unlock();
        }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8c1c21cd50b4eb446392f1f4a9f479053fc7876c..38892bdee651bf3206c541486b76be1a0e55491c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -213,6 +213,7 @@ struct bpf_call_arg_meta {
        s64 msize_smax_value;
        u64 msize_umax_value;
        int ptr_id;
+       int func_id;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -351,6 +352,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg)
        return type_is_refcounted(reg->type);
 }
 
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+       return reg->type == PTR_TO_MAP_VALUE &&
+               map_value_has_spin_lock(reg->map_ptr);
+}
+
 static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
 {
        return type_is_refcounted_or_null(reg->type);
@@ -712,6 +719,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
        }
        dst_state->speculative = src->speculative;
        dst_state->curframe = src->curframe;
+       dst_state->active_spin_lock = src->active_spin_lock;
        for (i = 0; i <= src->curframe; i++) {
                dst = dst_state->frame[i];
                if (!dst) {
@@ -1483,6 +1491,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
        if (err)
                verbose(env, "R%d max value is outside of the array range\n",
                        regno);
+
+       if (map_value_has_spin_lock(reg->map_ptr)) {
+               u32 lock = reg->map_ptr->spin_lock_off;
+
+               /* if any part of struct bpf_spin_lock can be touched by
+                * load/store reject this program.
+                * To check that [x1, x2) overlaps with [y1, y2)
+                * it is sufficient to check x1 < y2 && y1 < x2.
+                */
+               if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
+                    lock < reg->umax_value + off + size) {
+                       verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
+                       return -EACCES;
+               }
+       }
        return err;
 }
 
@@ -2192,6 +2215,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
        }
 }
 
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
+ * value_or_null->value transition, since the verifier only cares about
+ * the range of access to valid map value pointer and doesn't care about actual
+ * address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * cur_state->active_spin_lock remembers which map value element got locked
+ * and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
+                            bool is_lock)
+{
+       struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+       struct bpf_verifier_state *cur = env->cur_state;
+       bool is_const = tnum_is_const(reg->var_off);
+       struct bpf_map *map = reg->map_ptr;
+       u64 val = reg->var_off.value;
+
+       if (reg->type != PTR_TO_MAP_VALUE) {
+               verbose(env, "R%d is not a pointer to map_value\n", regno);
+               return -EINVAL;
+       }
+       if (!is_const) {
+               verbose(env,
+                       "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
+                       regno);
+               return -EINVAL;
+       }
+       if (!map->btf) {
+               verbose(env,
+                       "map '%s' has to have BTF in order to use bpf_spin_lock\n",
+                       map->name);
+               return -EINVAL;
+       }
+       if (!map_value_has_spin_lock(map)) {
+               if (map->spin_lock_off == -E2BIG)
+                       verbose(env,
+                               "map '%s' has more than one 'struct bpf_spin_lock'\n",
+                               map->name);
+               else if (map->spin_lock_off == -ENOENT)
+                       verbose(env,
+                               "map '%s' doesn't have 'struct bpf_spin_lock'\n",
+                               map->name);
+               else
+                       verbose(env,
+                               "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+                               map->name);
+               return -EINVAL;
+       }
+       if (map->spin_lock_off != val + reg->off) {
+               verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
+                       val + reg->off);
+               return -EINVAL;
+       }
+       if (is_lock) {
+               if (cur->active_spin_lock) {
+                       verbose(env,
+                               "Locking two bpf_spin_locks are not allowed\n");
+                       return -EINVAL;
+               }
+               cur->active_spin_lock = reg->id;
+       } else {
+               if (!cur->active_spin_lock) {
+                       verbose(env, "bpf_spin_unlock without taking a lock\n");
+                       return -EINVAL;
+               }
+               if (cur->active_spin_lock != reg->id) {
+                       verbose(env, "bpf_spin_unlock of different lock\n");
+                       return -EINVAL;
+               }
+               cur->active_spin_lock = 0;
+       }
+       return 0;
+}
+
 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
 {
        return type == ARG_PTR_TO_MEM ||
@@ -2268,6 +2376,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                        return -EFAULT;
                }
                meta->ptr_id = reg->id;
+       } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+               if (meta->func_id == BPF_FUNC_spin_lock) {
+                       if (process_spin_lock(env, regno, true))
+                               return -EACCES;
+               } else if (meta->func_id == BPF_FUNC_spin_unlock) {
+                       if (process_spin_lock(env, regno, false))
+                               return -EACCES;
+               } else {
+                       verbose(env, "verifier internal error\n");
+                       return -EFAULT;
+               }
        } else if (arg_type_is_mem_ptr(arg_type)) {
                expected_type = PTR_TO_STACK;
                /* One exception here. In case function allows for NULL to be
@@ -2887,6 +3006,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
                return err;
        }
 
+       meta.func_id = func_id;
        /* check args */
        err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
        if (err)
@@ -4473,7 +4593,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
                } else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
                        reg->type = PTR_TO_SOCKET;
                }
-               if (is_null || !reg_is_refcounted(reg)) {
+               if (is_null || !(reg_is_refcounted(reg) ||
+                                reg_may_point_to_spin_lock(reg))) {
                        /* We don't need id from this point onwards anymore,
                         * thus we should better reset it, so that state
                         * pruning has chances to take effect.
@@ -4871,6 +4992,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
                return err;
        }
 
+       if (env->cur_state->active_spin_lock) {
+               verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
+               return -EINVAL;
+       }
+
        if (regs[BPF_REG_6].type != PTR_TO_CTX) {
                verbose(env,
                        "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -5607,8 +5733,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
        case PTR_TO_MAP_VALUE:
                /* If the new min/max/var_off satisfy the old ones and
                 * everything else matches, we are OK.
-                * We don't care about the 'id' value, because nothing
-                * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+                * 'id' is not compared, since it's only used for maps with
+                * bpf_spin_lock inside map element and in such cases if
+                * the rest of the prog is valid for one map element then
+                * it's valid for all map elements regardless of the key
+                * used in bpf_map_lookup()
                 */
                return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
                       range_within(rold, rcur) &&
@@ -5811,6 +5940,9 @@ static bool states_equal(struct bpf_verifier_env *env,
        if (old->speculative && !cur->speculative)
                return false;
 
+       if (old->active_spin_lock != cur->active_spin_lock)
+               return false;
+
        /* for states to be equal callsites have to be the same
         * and all frame states need to be equivalent
         */
@@ -6229,6 +6361,12 @@ static int do_check(struct bpf_verifier_env *env)
                                        return -EINVAL;
                                }
 
+                               if (env->cur_state->active_spin_lock &&
+                                   (insn->src_reg == BPF_PSEUDO_CALL ||
+                                    insn->imm != BPF_FUNC_spin_unlock)) {
+                                       verbose(env, "function calls are not allowed while holding a lock\n");
+                                       return -EINVAL;
+                               }
                                if (insn->src_reg == BPF_PSEUDO_CALL)
                                        err = check_func_call(env, insn, &env->insn_idx);
                                else
@@ -6259,6 +6397,11 @@ static int do_check(struct bpf_verifier_env *env)
                                        return -EINVAL;
                                }
 
+                               if (env->cur_state->active_spin_lock) {
+                                       verbose(env, "bpf_spin_unlock is missing\n");
+                                       return -EINVAL;
+                               }
+
                                if (state->curframe) {
                                        /* exit from nested function */
                                        env->prev_insn_idx = env->insn_idx;
@@ -6356,6 +6499,19 @@ static int check_map_prealloc(struct bpf_map *map)
                !(map->map_flags & BPF_F_NO_PREALLOC);
 }
 
+static bool is_tracing_prog_type(enum bpf_prog_type type)
+{
+       switch (type) {
+       case BPF_PROG_TYPE_KPROBE:
+       case BPF_PROG_TYPE_TRACEPOINT:
+       case BPF_PROG_TYPE_PERF_EVENT:
+       case BPF_PROG_TYPE_RAW_TRACEPOINT:
+               return true;
+       default:
+               return false;
+       }
+}
+
 static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                                        struct bpf_map *map,
                                        struct bpf_prog *prog)
@@ -6378,6 +6534,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                }
        }
 
+       if ((is_tracing_prog_type(prog->type) ||
+            prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
+           map_value_has_spin_lock(map)) {
+               verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
+               return -EINVAL;
+       }
+
        if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
            !bpf_offload_prog_map_match(prog, map)) {
                verbose(env, "offload device mismatch between prog and map\n");
diff --git a/net/core/filter.c b/net/core/filter.c
index 41984ad4b9b4a4597370f8469e8f49782ee783b8..3a49f68eda10904ca9bd4c4ef3f8bd55f14bc54e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5314,10 +5314,20 @@ bpf_base_func_proto(enum bpf_func_id func_id)
                return &bpf_tail_call_proto;
        case BPF_FUNC_ktime_get_ns:
                return &bpf_ktime_get_ns_proto;
+       default:
+               break;
+       }
+
+       if (!capable(CAP_SYS_ADMIN))
+               return NULL;
+
+       switch (func_id) {
+       case BPF_FUNC_spin_lock:
+               return &bpf_spin_lock_proto;
+       case BPF_FUNC_spin_unlock:
+               return &bpf_spin_unlock_proto;
        case BPF_FUNC_trace_printk:
-               if (capable(CAP_SYS_ADMIN))
-                       return bpf_get_trace_printk_proto();
-               /* else, fall through */
+               return bpf_get_trace_printk_proto();
        default:
                return NULL;
        }