bpf: extend stackmap to save binary_build_id+offset instead of address

author Song Liu <songliubraving@fb.com>

Wed, 14 Mar 2018 17:23:21 +0000 (10:23 -0700)

committer Daniel Borkmann <daniel@iogearbox.net>

Thu, 15 Mar 2018 00:09:28 +0000 (01:09 +0100)
author Song Liu <songliubraving@fb.com>
Wed, 14 Mar 2018 17:23:21 +0000 (10:23 -0700)
committer Daniel Borkmann <daniel@iogearbox.net>
Thu, 15 Mar 2018 00:09:28 +0000 (01:09 +0100)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index 2a66769e58753f802c5f2df8c92d131e0ed5edd9..1e15d1724d89aade8ae9c4f4860c374e35d72359 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -231,6 +231,28 @@ enum bpf_attach_type {
  #define BPF_F_RDONLY           (1U << 3)
  #define BPF_F_WRONLY           (1U << 4)
  
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID   (1U << 5)
+
+/* Values stored in the status field of struct bpf_stack_build_id */
+enum bpf_stack_build_id_status {
+       /* user space needs an empty entry to identify the end of a trace */
+       BPF_STACK_BUILD_ID_EMPTY = 0,
+       /* entry holds a valid build_id and offset */
+       BPF_STACK_BUILD_ID_VALID = 1,
+       /* couldn't get build_id, entry falls back to ip */
+       BPF_STACK_BUILD_ID_IP = 2,
+};
+
+/* Number of build ID bytes stored per stack frame */
+#define BPF_BUILD_ID_SIZE 20
+/*
+ * One stack frame of a stack map created with BPF_F_STACK_BUILD_ID.
+ * status holds an enum bpf_stack_build_id_status value and tells which
+ * member of the union is meaningful.
+ */
+struct bpf_stack_build_id {
+       __s32           status;
+       unsigned char   build_id[BPF_BUILD_ID_SIZE];
+       union {
+               /* used when status is BPF_STACK_BUILD_ID_VALID */
+               __u64   offset;
+               /* used when status is BPF_STACK_BUILD_ID_IP */
+               __u64   ip;
+       };
+};
+
  union bpf_attr {
         struct { /* anonymous struct used by BPF_MAP_CREATE command */
                 __u32   map_type;       /* one of enum bpf_map_type */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c

index b0ecf43f5894d12de9a20f4399e79e0b6d2979b8..57eeb1234b67e7dabd555e9562b0a0b59cd57abb 100644 (file)
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
  #include <linux/filter.h>
  #include <linux/stacktrace.h>
  #include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
  #include "percpu_freelist.h"
  
-#define STACK_CREATE_FLAG_MASK \
-       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK                                 \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |        \
+        BPF_F_STACK_BUILD_ID)
  
  struct stack_map_bucket {
         struct pcpu_freelist_node fnode;
         u32 hash;
         u32 nr;
-       u64 ip[];
+       u64 data[];
  };
  
  struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
         struct stack_map_bucket *buckets[];
  };
  
+/* Tell whether @map was created with the BPF_F_STACK_BUILD_ID flag. */
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+       return (map->map_flags & BPF_F_STACK_BUILD_ID) != 0;
+}
+
+/* Size in bytes of a single stack frame entry stored in @map. */
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+       if (stack_map_use_build_id(map))
+               return sizeof(struct bpf_stack_build_id);
+
+       return sizeof(u64);
+}
+
  static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
  {
         u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
  
         /* check sanity of attributes */
         if (attr->max_entries == 0 || attr->key_size != 4 ||
-           value_size < 8 || value_size % 8 ||
-           value_size / 8 > sysctl_perf_event_max_stack)
+           value_size < 8 || value_size % 8)
+               return ERR_PTR(-EINVAL);
+
+       BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+       if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+               if (value_size % sizeof(struct bpf_stack_build_id) ||
+                   value_size / sizeof(struct bpf_stack_build_id)
+                   > sysctl_perf_event_max_stack)
+                       return ERR_PTR(-EINVAL);
+       } else if (value_size / 8 > sysctl_perf_event_max_stack)
                 return ERR_PTR(-EINVAL);
  
         /* hash table size must be power of 2 */
@@ -114,13 +136,184 @@ free_smap:
         return ERR_PTR(err);
  }
  
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ *
+ * @page_addr:  start of the page that holds the note segment
+ * @build_id:   output buffer of BPF_BUILD_ID_SIZE bytes
+ * @note_start: start of the note segment, must lie inside the page
+ * @note_size:  size of the note segment in bytes
+ *
+ * A note is accepted only if its type is BPF_BUILD_ID, its name is
+ * "GNU", its descriptor is BPF_BUILD_ID_SIZE bytes long and the whole
+ * note lies inside the segment.
+ *
+ * Returns 0 and fills @build_id on success, -EINVAL otherwise.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+                                          unsigned char *build_id,
+                                          void *note_start,
+                                          Elf32_Word note_size)
+{
+       /* note header + padded "GNU" name + build id descriptor */
+       const Elf32_Word id_note_size = sizeof(Elf32_Nhdr) +
+               ALIGN(sizeof("GNU"), 4) + BPF_BUILD_ID_SIZE;
+       Elf32_Word note_offs = 0, new_offs;
+
+       /* check for overflow */
+       if (note_start < page_addr || note_start + note_size < note_start)
+               return -EINVAL;
+
+       /* only supports note that fits in the first page */
+       if (note_start + note_size > page_addr + PAGE_SIZE)
+               return -EINVAL;
+
+       /* compare remaining bytes, so the test cannot wrap on 32-bit */
+       while (note_offs < note_size &&
+              note_size - note_offs > sizeof(Elf32_Nhdr)) {
+               Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+               /*
+                * The size test comes before the name comparison, so
+                * neither the memcmp() nor the memcpy() below can read
+                * past the end of the note segment.
+                */
+               if (nhdr->n_type == BPF_BUILD_ID &&
+                   nhdr->n_namesz == sizeof("GNU") &&
+                   nhdr->n_descsz == BPF_BUILD_ID_SIZE &&
+                   note_size - note_offs >= id_note_size &&
+                   memcmp(nhdr + 1, "GNU", sizeof("GNU")) == 0) {
+                       memcpy(build_id,
+                              note_start + note_offs +
+                              ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+                              BPF_BUILD_ID_SIZE);
+                       return 0;
+               }
+               new_offs = note_offs + sizeof(Elf32_Nhdr) +
+                       ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+               if (new_offs <= note_offs)  /* overflow */
+                       break;
+               note_offs = new_offs;
+       }
+       return -EINVAL;
+}
+
+/*
+ * Parse build ID from 32-bit ELF.
+ *
+ * @page_addr: start of the page holding the ELF header
+ * @build_id:  output buffer of BPF_BUILD_ID_SIZE bytes
+ *
+ * The program header table is read from directly behind the ELF header
+ * and has to fit in the same page. All PT_NOTE segments are searched,
+ * because the build ID note need not be in the first one.
+ *
+ * Returns 0 on success, -EINVAL if no build ID could be parsed.
+ */
+static int stack_map_get_build_id_32(void *page_addr,
+                                    unsigned char *build_id)
+{
+       Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+       Elf32_Phdr *phdr;
+       int i;
+
+       /* only supports phdr that fits in one page */
+       if (ehdr->e_phnum >
+           (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+               return -EINVAL;
+
+       phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+       for (i = 0; i < ehdr->e_phnum; ++i) {
+               if (phdr[i].p_type != PT_NOTE)
+                       continue;
+               /* keep scanning when this segment has no build ID */
+               if (!stack_map_parse_build_id(page_addr, build_id,
+                                             page_addr + phdr[i].p_offset,
+                                             phdr[i].p_filesz))
+                       return 0;
+       }
+       return -EINVAL;
+}
+
+/*
+ * Parse build ID from 64-bit ELF.
+ *
+ * @page_addr: start of the page holding the ELF header
+ * @build_id:  output buffer of BPF_BUILD_ID_SIZE bytes
+ *
+ * The program header table is read from directly behind the ELF header
+ * and has to fit in the same page. All PT_NOTE segments are searched,
+ * because the build ID note need not be in the first one.
+ *
+ * Returns 0 on success, -EINVAL if no build ID could be parsed.
+ */
+static int stack_map_get_build_id_64(void *page_addr,
+                                    unsigned char *build_id)
+{
+       Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+       Elf64_Phdr *phdr;
+       int i;
+
+       /* only supports phdr that fits in one page */
+       if (ehdr->e_phnum >
+           (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+               return -EINVAL;
+
+       phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+       for (i = 0; i < ehdr->e_phnum; ++i) {
+               if (phdr[i].p_type != PT_NOTE)
+                       continue;
+               /* keep scanning when this segment has no build ID */
+               if (!stack_map_parse_build_id(page_addr, build_id,
+                                             page_addr + phdr[i].p_offset,
+                                             phdr[i].p_filesz))
+                       return 0;
+       }
+       return -EINVAL;
+}
+
+/*
+ * Parse build ID of ELF file mapped to vma.
+ *
+ * @vma:      mapping whose backing file is inspected
+ * @build_id: output buffer of BPF_BUILD_ID_SIZE bytes
+ *
+ * Only the first page of the file is examined, and only if it is
+ * already present in the page cache.
+ *
+ * Returns 0 on success, -EFAULT if the first page is not in the page
+ * cache, -EINVAL if the vma has no file or no build ID could be parsed.
+ */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+                                 unsigned char *build_id)
+{
+       Elf32_Ehdr *ehdr;
+       struct page *page;
+       void *page_addr;
+       int ret;
+
+       /* only works for page backed storage  */
+       if (!vma->vm_file)
+               return -EINVAL;
+
+       page = find_get_page(vma->vm_file->f_mapping, 0);
+       if (!page)
+               return -EFAULT; /* page not in the page cache */
+
+       ret = -EINVAL;
+       /*
+        * page_address() may return NULL for a highmem page, so map the
+        * page explicitly instead of relying on a permanent mapping.
+        */
+       page_addr = kmap_atomic(page);
+       ehdr = (Elf32_Ehdr *)page_addr;
+
+       /* compare magic "\x7f" "ELF" */
+       if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+               goto out;
+
+       /* only support executable file and shared object file */
+       if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+               goto out;
+
+       /* any other EI_CLASS value leaves ret at -EINVAL */
+       if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+               ret = stack_map_get_build_id_32(page_addr, build_id);
+       else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+               ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+       kunmap_atomic(page_addr);
+       put_page(page);
+       return ret;
+}
+
+/*
+ * Fill bucket->data with one struct bpf_stack_build_id per entry of ips.
+ *
+ * @map:      stack map the bucket belongs to
+ * @bucket:   bucket to fill; bucket->nr is set to trace_nr
+ * @ips:      instruction pointers of the stack trace
+ * @trace_nr: number of entries in ips
+ * @user:     true if ips is a user space stack trace
+ *
+ * Entries that cannot be resolved to build_id+offset are stored as
+ * BPF_STACK_BUILD_ID_IP with a zeroed build_id. The zeroing matters
+ * because the bucket contents are compared with memcmp() and copied out
+ * by bpf_stackmap_copy(), so bytes left over from an earlier use of the
+ * bucket must not remain in the entry.
+ */
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+                                         struct stack_map_bucket *bucket,
+                                         u64 *ips, u32 trace_nr, bool user)
+{
+       u32 i;
+       struct vm_area_struct *vma;
+       struct bpf_stack_build_id *id_offs;
+
+       bucket->nr = trace_nr;
+       id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+       /*
+        * We cannot do up_read() in nmi context, so build_id lookup is
+        * only supported for non-nmi events. If at some point, it is
+        * possible to run find_vma() without taking the semaphore, we
+        * would like to allow build_id lookup in nmi context.
+        *
+        * Same fallback is used for kernel stack (!user) on a stackmap
+        * with build_id.
+        */
+       if (!user || !current || !current->mm || in_nmi() ||
+           down_read_trylock(&current->mm->mmap_sem) == 0) {
+               /* cannot access current->mm, fall back to ips */
+               for (i = 0; i < trace_nr; i++) {
+                       id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+                       id_offs[i].ip = ips[i];
+                       memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+               }
+               return;
+       }
+
+       for (i = 0; i < trace_nr; i++) {
+               vma = find_vma(current->mm, ips[i]);
+               /*
+                * find_vma() returns the first vma that ends above the
+                * address, which may start above it as well; such an
+                * address is not mapped and has no file offset.
+                */
+               if (!vma || ips[i] < vma->vm_start ||
+                   stack_map_get_build_id(vma, id_offs[i].build_id)) {
+                       /* per entry fall back to ips */
+                       id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+                       id_offs[i].ip = ips[i];
+                       memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+                       continue;
+               }
+               id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+                       - vma->vm_start;
+               id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+       }
+       up_read(&current->mm->mmap_sem);
+}
+
  BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
            u64, flags)
  {
         struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
         struct perf_callchain_entry *trace;
         struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-       u32 max_depth = map->value_size / 8;
+       u32 max_depth = map->value_size / stack_map_data_size(map);
         /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
         u32 init_nr = sysctl_perf_event_max_stack - max_depth;
         u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
         bool user = flags & BPF_F_USER_STACK;
         bool kernel = !user;
         u64 *ips;
+       bool hash_matches;
  
         if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                                BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
         id = hash & (smap->n_buckets - 1);
         bucket = READ_ONCE(smap->buckets[id]);
  
-       if (bucket && bucket->hash == hash) {
-               if (flags & BPF_F_FAST_STACK_CMP)
+       hash_matches = bucket && bucket->hash == hash;
+       /* fast cmp */
+       if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+               return id;
+
+       if (stack_map_use_build_id(map)) {
+               /* for build_id+offset, pop a bucket before slow cmp */
+               new_bucket = (struct stack_map_bucket *)
+                       pcpu_freelist_pop(&smap->freelist);
+               if (unlikely(!new_bucket))
+                       return -ENOMEM;
+               stack_map_get_build_id_offset(map, new_bucket, ips,
+                                             trace_nr, user);
+               trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+               if (hash_matches && bucket->nr == trace_nr &&
+                   memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+                       pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                         return id;
-               if (bucket->nr == trace_nr &&
-                   memcmp(bucket->ip, ips, trace_len) == 0)
+               }
+               if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+                       pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+                       return -EEXIST;
+               }
+       } else {
+               if (hash_matches && bucket->nr == trace_nr &&
+                   memcmp(bucket->data, ips, trace_len) == 0)
                         return id;
+               if (bucket && !(flags & BPF_F_REUSE_STACKID))
+                       return -EEXIST;
+
+               new_bucket = (struct stack_map_bucket *)
+                       pcpu_freelist_pop(&smap->freelist);
+               if (unlikely(!new_bucket))
+                       return -ENOMEM;
+               memcpy(new_bucket->data, ips, trace_len);
         }
  
-       /* this call stack is not in the map, try to add it */
-       if (bucket && !(flags & BPF_F_REUSE_STACKID))
-               return -EEXIST;
-
-       new_bucket = (struct stack_map_bucket *)
-               pcpu_freelist_pop(&smap->freelist);
-       if (unlikely(!new_bucket))
-               return -ENOMEM;
-
-       memcpy(new_bucket->ip, ips, trace_len);
         new_bucket->hash = hash;
         new_bucket->nr = trace_nr;
  
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
         if (!bucket)
                 return -ENOENT;
  
-       trace_len = bucket->nr * sizeof(u64);
-       memcpy(value, bucket->ip, trace_len);
+       trace_len = bucket->nr * stack_map_data_size(map);
+       memcpy(value, bucket->data, trace_len);
         memset(value + trace_len, 0, map->value_size - trace_len);
  
         old_bucket = xchg(&smap->buckets[id], bucket);
author	Song Liu <songliubraving@fb.com>
	Wed, 14 Mar 2018 17:23:21 +0000 (10:23 -0700)
committer	Daniel Borkmann <daniel@iogearbox.net>
	Thu, 15 Mar 2018 00:09:28 +0000 (01:09 +0100)
include/uapi/linux/bpf.h		patch \| blob \| history
kernel/bpf/stackmap.c		patch \| blob \| history