perf core: Allow setting up max frame stack depth via sysctl
authorArnaldo Carvalho de Melo <acme@redhat.com>
Thu, 21 Apr 2016 15:28:50 +0000 (12:28 -0300)
committerArnaldo Carvalho de Melo <acme@redhat.com>
Wed, 27 Apr 2016 13:20:39 +0000 (10:20 -0300)
The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.

And in some workloads putting a _lower_ cap on this may make sense. A
per-event limit still needs to be put in place, though.

The new file is:

  # cat /proc/sys/kernel/perf_event_max_stack
  127

Changing it:

  # echo 256 > /proc/sys/kernel/perf_event_max_stack
  # cat /proc/sys/kernel/perf_event_max_stack
  256

But as soon as there is some event using callchains we get:

  # echo 512 > /proc/sys/kernel/perf_event_max_stack
  -bash: echo: write error: Device or resource busy
  #

This is because we only allocate the callchain percpu data structures when
there is a user. That makes changing the max easy: it's just a matter
of having no callchain users at that point.

Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: David Ahern <dsahern@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/20160426002928.GB16708@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
13 files changed:
Documentation/sysctl/kernel.txt
arch/arm/kernel/perf_callchain.c
arch/arm64/kernel/perf_callchain.c
arch/metag/kernel/perf_callchain.c
arch/mips/kernel/perf_event.c
arch/powerpc/perf/callchain.c
arch/sparc/kernel/perf_event.c
arch/x86/events/core.c
arch/xtensa/kernel/perf_event.c
include/linux/perf_event.h
kernel/bpf/stackmap.c
kernel/events/callchain.c
kernel/sysctl.c

index 57653a44b128c821520b071f0ec4b53bfacc7b96..260cde08e92e828a842cbd780e70bec72d7d3826 100644 (file)
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
index 4e02ae5950ff6463e4da472066b55ceef9a970be..27563befa8a2df5b27ea39b56bde584771945dfd 100644 (file)
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
        tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-       while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+       while ((entry->nr < sysctl_perf_event_max_stack) &&
               tail && !((unsigned long)tail & 0x3))
                tail = user_backtrace(tail, entry);
 }
index ff4665462a025d4ec2655ca30d49732a63194e53..32c3c6e70119f4e123498b85f1bc28398e333b13 100644 (file)
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
                tail = (struct frame_tail __user *)regs->regs[29];
 
-               while (entry->nr < PERF_MAX_STACK_DEPTH &&
+               while (entry->nr < sysctl_perf_event_max_stack &&
                       tail && !((unsigned long)tail & 0xf))
                        tail = user_backtrace(tail, entry);
        } else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
                tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-               while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+               while ((entry->nr < sysctl_perf_event_max_stack) &&
                        tail && !((unsigned long)tail & 0x3))
                        tail = compat_user_backtrace(tail, entry);
 #endif
index 315633461a94537c51c96b7ba92420b9715fd523..252abc12a5a31f6221b106b060ffd6926a8a6b5b 100644 (file)
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
        --frame;
 
-       while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+       while ((entry->nr < sysctl_perf_event_max_stack) && frame)
                frame = user_backtrace(frame, entry);
 }
 
index c1cf9c6c3f7705b9c50281d196633d91c8e788e5..5021c546ad07d3e28b7d0ac1969448c32b9a93e2 100644 (file)
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
                addr = *sp++;
                if (__kernel_text_address(addr)) {
                        perf_callchain_store(entry, addr);
-                       if (entry->nr >= PERF_MAX_STACK_DEPTH)
+                       if (entry->nr >= sysctl_perf_event_max_stack)
                                break;
                }
        }
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
        }
        do {
                perf_callchain_store(entry, pc);
-               if (entry->nr >= PERF_MAX_STACK_DEPTH)
+               if (entry->nr >= sysctl_perf_event_max_stack)
                        break;
                pc = unwind_stack(current, &sp, pc, &ra);
        } while (pc);
index e04a6752b39991bbdf5ba389aef524182511fa9c..22d9015c1acc80dea12e78c1ac53f8fcd3b19f0a 100644 (file)
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
        sp = regs->gpr[1];
        perf_callchain_store(entry, next_ip);
 
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+       while (entry->nr < sysctl_perf_event_max_stack) {
                fp = (unsigned long __user *) sp;
                if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
                        return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
        sp = regs->gpr[1];
        perf_callchain_store(entry, next_ip);
 
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+       while (entry->nr < sysctl_perf_event_max_stack) {
                fp = (unsigned int __user *) (unsigned long) sp;
                if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
                        return;
index 6596f66ce1126fa487b7ce8eec19ef17dc4912e7..a4b8b5aed21c7b0fa83b30c7f3783e5c109ccf61 100644 (file)
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
                        }
                }
 #endif
-       } while (entry->nr < PERF_MAX_STACK_DEPTH);
+       } while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
                pc = sf.callers_pc;
                ufp = (unsigned long)sf.fp + STACK_BIAS;
                perf_callchain_store(entry, pc);
-       } while (entry->nr < PERF_MAX_STACK_DEPTH);
+       } while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
                        ufp = (unsigned long)sf.fp;
                }
                perf_callchain_store(entry, pc);
-       } while (entry->nr < PERF_MAX_STACK_DEPTH);
+       } while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void
index 041e442a3e2806ed884584758cb8e62abd809e36..41d93d0e972b32d6c3b4d5594cb9ee68bd307e0b 100644 (file)
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
        fp = compat_ptr(ss_base + regs->bp);
        pagefault_disable();
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+       while (entry->nr < sysctl_perf_event_max_stack) {
                unsigned long bytes;
                frame.next_frame     = 0;
                frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
                return;
 
        pagefault_disable();
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+       while (entry->nr < sysctl_perf_event_max_stack) {
                unsigned long bytes;
                frame.next_frame             = NULL;
                frame.return_address = 0;
index 54f01188c29c1a4048ac166d9383d7f3a26905c8..a6b00b3af42993e937181a8412c8949f8ae65983 100644 (file)
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
                           struct pt_regs *regs)
 {
-       xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+       xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
                                callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
                         struct pt_regs *regs)
 {
-       xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+       xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
                              callchain_trace, entry);
 }
 
index 85749ae8cb5fac070cac24e9076324085d7db646..a090700ccccaf824bc40bd188aef523da259480e 100644 (file)
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
        __u64                           nr;
-       __u64                           ip[PERF_MAX_STACK_DEPTH];
+       __u64                           ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-       if (entry->nr < PERF_MAX_STACK_DEPTH) {
+       if (entry->nr < sysctl_perf_event_max_stack) {
                entry->ip[entry->nr++] = ip;
                return 0;
        } else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+                                void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
index 499d9e933f8e5baab0f3de1199797b56fdc3d589..f5a19548be12ee425ba3a90d1faff12b2dccdb8e 100644 (file)
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            value_size < 8 || value_size % 8 ||
-           value_size / 8 > PERF_MAX_STACK_DEPTH)
+           value_size / 8 > sysctl_perf_event_max_stack)
                return ERR_PTR(-EINVAL);
 
        /* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
        struct perf_callchain_entry *trace;
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / 8;
-       /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
-       u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+       /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+       u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
                return -EFAULT;
 
        /* get_perf_callchain() guarantees that trace->nr >= init_nr
-        * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+        * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
         */
        trace_nr = trace->nr - init_nr;
 
index 343c22f5e867de2bbe6c2220535b413a6ea0f4ed..b9325e7dcba1088d74e2502177d2a22ececce4dc 100644 (file)
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
        struct perf_callchain_entry     *cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+       return (sizeof(struct perf_callchain_entry) +
+               sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
        if (!entries)
                return -ENOMEM;
 
-       size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+       size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
        for_each_possible_cpu(cpu) {
                entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 
        cpu = smp_processor_id();
 
-       return &entries->cpu_entries[cpu][*rctx];
+       return (((void *)entries->cpu_entries[cpu]) +
+               (*rctx * perf_callchain_entry__sizeof()));
 }
 
 static void
@@ -215,3 +224,25 @@ exit_put:
 
        return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+                                void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int new_value = sysctl_perf_event_max_stack, ret;
+       struct ctl_table new_table = *table;
+
+       new_table.data = &new_value;
+       ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+       if (ret || !write)
+               return ret;
+
+       mutex_lock(&callchain_mutex);
+       if (atomic_read(&nr_callchain_events))
+               ret = -EBUSY;
+       else
+               sysctl_perf_event_max_stack = new_value;
+
+       mutex_unlock(&callchain_mutex);
+
+       return ret;
+}
index 725587f10667eef64326263941bea17b7fb82cf1..c8b318663525d02b2098238341aca72c701966fe 100644 (file)
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
 
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
+       {
+               .procname       = "perf_event_max_stack",
+               .data           = NULL, /* filled in by handler */
+               .maxlen         = sizeof(sysctl_perf_event_max_stack),
+               .mode           = 0644,
+               .proc_handler   = perf_event_max_stack_handler,
+               .extra1         = &zero,
+               .extra2         = &six_hundred_forty_kb,
+       },
 #endif
 #ifdef CONFIG_KMEMCHECK
        {