perf_counter: Allow for a wakeup watermark
authorPeter Zijlstra <a.p.zijlstra@chello.nl>
Thu, 17 Sep 2009 17:01:10 +0000 (19:01 +0200)
committerIngo Molnar <mingo@elte.hu>
Thu, 17 Sep 2009 20:08:26 +0000 (22:08 +0200)
Currently we wake the mmap() consumer once every PAGE_SIZE of data
and/or once every wakeup_events when specified.

For high speed sampling this results in too many wakeups wrt. the
buffer size, hence change this.

We move the default wakeup limit to one quarter of the buffer size, and
provide a means to manually specify this limit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
include/linux/perf_counter.h
kernel/perf_counter.c

index 972f90d7a32f7660fa1ef2362c4904c4b3651d28..6c1ef72ea501967e472b24efa54da5cf348cd5fe 100644 (file)
@@ -199,10 +199,14 @@ struct perf_counter_attr {
                                inherit_stat   :  1, /* per task counts       */
                                enable_on_exec :  1, /* next exec enables     */
                                task           :  1, /* trace fork/exit       */
+                               watermark      :  1, /* wakeup_watermark      */
 
-                               __reserved_1   : 50;
+                               __reserved_1   : 49;
 
-       __u32                   wakeup_events;  /* wakeup every n events */
+       union {
+               __u32           wakeup_events;    /* wakeup every n events */
+               __u32           wakeup_watermark; /* bytes before wakeup   */
+       };
        __u32                   __reserved_2;
 
        __u64                   __reserved_3;
@@ -521,6 +525,8 @@ struct perf_mmap_data {
        atomic_t                        wakeup;         /* needs a wakeup    */
        atomic_t                        lost;           /* nr records lost   */
 
+       long                            watermark;      /* wakeup watermark  */
+
        struct perf_counter_mmap_page   *user_page;
        void                            *data_pages[0];
 };
index fe0d1adde8042b375e572bfc2631fa75bb6ea1e2..29b73b6e81463d910040ceb5bd86797282309142 100644 (file)
@@ -2176,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
        data->nr_pages = nr_pages;
        atomic_set(&data->lock, -1);
 
+       if (counter->attr.watermark) {
+               data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+                                     counter->attr.wakeup_watermark);
+       }
+       if (!data->watermark)
+               data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
        rcu_assign_pointer(counter->data, data);
 
        return 0;
@@ -2517,23 +2524,15 @@ struct perf_output_handle {
        unsigned long           flags;
 };
 
-static bool perf_output_space(struct perf_mmap_data *data,
-                             unsigned int offset, unsigned int head)
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+                             unsigned long offset, unsigned long head)
 {
-       unsigned long tail;
        unsigned long mask;
 
        if (!data->writable)
                return true;
 
        mask = (data->nr_pages << PAGE_SHIFT) - 1;
-       /*
-        * Userspace could choose to issue a mb() before updating the tail
-        * pointer. So that all reads will be completed before the write is
-        * issued.
-        */
-       tail = ACCESS_ONCE(data->user_page->data_tail);
-       smp_rmb();
 
        offset = (offset - tail) & mask;
        head   = (head   - tail) & mask;
@@ -2679,7 +2678,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 {
        struct perf_counter *output_counter;
        struct perf_mmap_data *data;
-       unsigned int offset, head;
+       unsigned long tail, offset, head;
        int have_lost;
        struct {
                struct perf_event_header header;
@@ -2717,16 +2716,23 @@ static int perf_output_begin(struct perf_output_handle *handle,
        perf_output_lock(handle);
 
        do {
+               /*
+                * Userspace could choose to issue a mb() before updating the
+                * tail pointer. So that all reads will be completed before the
+                * write is issued.
+                */
+               tail = ACCESS_ONCE(data->user_page->data_tail);
+               smp_rmb();
                offset = head = atomic_long_read(&data->head);
                head += size;
-               if (unlikely(!perf_output_space(data, offset, head)))
+               if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
        } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
        handle->offset  = offset;
        handle->head    = head;
 
-       if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+       if (head - tail > data->watermark)
                atomic_set(&data->wakeup, 1);
 
        if (have_lost) {