fix memory leaks in tracing_buffers_splice_read()

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a2f0b9f..77eeab2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -253,6 +253,9 @@ unsigned long long ns2usecs(cycle_t nsec)
 #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK |                     \
               TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)
 
+/* trace_flags that are default zero for instances */
+#define ZEROED_TRACE_FLAGS \
+       TRACE_ITER_EVENT_FORK
 
 /*
  * The global_trace is the descriptor that holds the tracing
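
ZEROED_TRACE_FLAGS collects the trace_flags bits that must start out cleared in a newly created trace instance; for now that is only TRACE_ITER_EVENT_FORK. The instance_mkdir() and instance_rmdir() hunks at the end of this diff consume the mask. A minimal sketch of the create-side use, with a hypothetical helper name (the real change is applied inline in instance_mkdir()):

	/* hypothetical helper, for illustration only */
	static void trace_instance_init_flags(struct trace_array *tr)
	{
		/* inherit the global flags, but force the zeroed-by-default ones off */
		tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
	}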
@@ -303,33 +306,18 @@ void trace_array_put(struct trace_array *this_tr)
        mutex_unlock(&trace_types_lock);
 }
 
-int filter_check_discard(struct trace_event_file *file, void *rec,
-                        struct ring_buffer *buffer,
-                        struct ring_buffer_event *event)
-{
-       if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
-           !filter_match_preds(file->filter, rec)) {
-               ring_buffer_discard_commit(buffer, event);
-               return 1;
-       }
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(filter_check_discard);
-
 int call_filter_check_discard(struct trace_event_call *call, void *rec,
                              struct ring_buffer *buffer,
                              struct ring_buffer_event *event)
 {
        if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
            !filter_match_preds(call->filter, rec)) {
-               ring_buffer_discard_commit(buffer, event);
+               __trace_event_discard_commit(buffer, event);
                return 1;
        }
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(call_filter_check_discard);
 
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
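
With filter_check_discard() removed, call_filter_check_discard() is the remaining discard helper, and it now goes through __trace_event_discard_commit() so that an event sitting in the new per-CPU temp buffer (introduced below) is simply released instead of being discarded from the ring buffer. The caller pattern is unchanged; roughly how trace_function() later in this file uses it (a sketch with that function's variable names, not code added by this patch):

	event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
					  flags, pc);
	if (!event)
		return;
	entry = ring_buffer_event_data(event);
	entry->ip	 = ip;
	entry->parent_ip = parent_ip;

	if (!call_filter_check_discard(call, entry, buffer, event))
		__buffer_unlock_commit(buffer, event);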
@@ -1672,6 +1660,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+                 int type, unsigned long flags, int pc)
+{
+       struct trace_entry *ent = ring_buffer_event_data(event);
+
+       tracing_generic_entry_update(ent, flags, pc);
+       ent->type = type;
+}
+
 struct ring_buffer_event *
 trace_buffer_lock_reserve(struct ring_buffer *buffer,
                          int type,
@@ -1681,34 +1679,137 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
        struct ring_buffer_event *event;
 
        event = ring_buffer_lock_reserve(buffer, len);
-       if (event != NULL) {
-               struct trace_entry *ent = ring_buffer_event_data(event);
+       if (event != NULL)
+               trace_event_setup(event, type, flags, pc);
+
+       return event;
+}
+
+DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DEFINE_PER_CPU(int, trace_buffered_event_cnt);
+static int trace_buffered_event_ref;
+
+/**
+ * trace_buffered_event_enable - enable buffering events
+ *
+ * When events are being filtered, it is quicker to use a temporary
+ * buffer to write the event data into if there's a likely chance
+ * that it will not be committed. The discard of the ring buffer
+ * is not as fast as committing, and is much slower than copying
+ * a commit.
+ *
+ * When an event is to be filtered, allocate per cpu buffers to
+ * write the event data into, and if the event is filtered and discarded
+ * it is simply dropped, otherwise, the entire data is to be committed
+ * in one shot.
+ */
+void trace_buffered_event_enable(void)
+{
+       struct ring_buffer_event *event;
+       struct page *page;
+       int cpu;
+
+       WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+       if (trace_buffered_event_ref++)
+               return;
+
+       for_each_tracing_cpu(cpu) {
+               page = alloc_pages_node(cpu_to_node(cpu),
+                                       GFP_KERNEL | __GFP_NORETRY, 0);
+               if (!page)
+                       goto failed;
+
+               event = page_address(page);
+               memset(event, 0, sizeof(*event));
 
-               tracing_generic_entry_update(ent, flags, pc);
-               ent->type = type;
+               per_cpu(trace_buffered_event, cpu) = event;
+
+               preempt_disable();
+               if (cpu == smp_processor_id() &&
+                   this_cpu_read(trace_buffered_event) !=
+                   per_cpu(trace_buffered_event, cpu))
+                       WARN_ON_ONCE(1);
+               preempt_enable();
        }
 
-       return event;
+       return;
+ failed:
+       trace_buffered_event_disable();
 }
 
-void
-__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+static void enable_trace_buffered_event(void *data)
 {
-       __this_cpu_write(trace_cmdline_save, true);
-       ring_buffer_unlock_commit(buffer, event);
+       /* Probably not needed, but do it anyway */
+       smp_rmb();
+       this_cpu_dec(trace_buffered_event_cnt);
 }
 
-void trace_buffer_unlock_commit(struct trace_array *tr,
-                               struct ring_buffer *buffer,
-                               struct ring_buffer_event *event,
-                               unsigned long flags, int pc)
+static void disable_trace_buffered_event(void *data)
 {
-       __buffer_unlock_commit(buffer, event);
+       this_cpu_inc(trace_buffered_event_cnt);
+}
 
-       ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
-       ftrace_trace_userstack(buffer, flags, pc);
+/**
+ * trace_buffered_event_disable - disable buffering events
+ *
+ * When a filter is removed, it is faster to not use the buffered
+ * events, and to commit directly into the ring buffer. Free up
+ * the temp buffers when there are no more users. This requires
+ * special synchronization with current events.
+ */
+void trace_buffered_event_disable(void)
+{
+       int cpu;
+
+       WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+       if (WARN_ON_ONCE(!trace_buffered_event_ref))
+               return;
+
+       if (--trace_buffered_event_ref)
+               return;
+
+       preempt_disable();
+       /* For each CPU, set the buffer as used. */
+       smp_call_function_many(tracing_buffer_mask,
+                              disable_trace_buffered_event, NULL, 1);
+       preempt_enable();
+
+       /* Wait for all current users to finish */
+       synchronize_sched();
+
+       for_each_tracing_cpu(cpu) {
+               free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
+               per_cpu(trace_buffered_event, cpu) = NULL;
+       }
+       /*
+        * Make sure trace_buffered_event is NULL before clearing
+        * trace_buffered_event_cnt.
+        */
+       smp_wmb();
+
+       preempt_disable();
+       /* Do the work on each cpu */
+       smp_call_function_many(tracing_buffer_mask,
+                              enable_trace_buffered_event, NULL, 1);
+       preempt_enable();
+}
+
+void
+__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+{
+       __this_cpu_write(trace_cmdline_save, true);
+
+       /* If this is the temp buffer, we need to commit fully */
+       if (this_cpu_read(trace_buffered_event) == event) {
+               /* Length is in event->array[0] */
+               ring_buffer_write(buffer, event->array[0], &event->array[1]);
+               /* Release the temp buffer */
+               this_cpu_dec(trace_buffered_event_cnt);
+       } else
+               ring_buffer_unlock_commit(buffer, event);
 }
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
 
 static struct ring_buffer *temp_buffer;
 
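The new __buffer_unlock_commit() recognizes an event that lives in the per-CPU temp page: the reserve path (next hunk) stashes the payload length in event->array[0], so the commit copies that many bytes into the ring buffer with ring_buffer_write() and releases the slot by decrementing trace_buffered_event_cnt; a normal ring-buffer event is still committed with ring_buffer_unlock_commit(). The discard side lives in trace.h and is not shown in this diff; its assumed counterpart would look like this (reconstruction):

	static __always_inline void
	__trace_event_discard_commit(struct ring_buffer *buffer,
				     struct ring_buffer_event *event)
	{
		if (this_cpu_read(trace_buffered_event) == event) {
			/* Simply release the temp buffer */
			this_cpu_dec(trace_buffered_event_cnt);
			return;
		}
		ring_buffer_discard_commit(buffer, event);
	}
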
@@ -1719,8 +1820,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
                          unsigned long flags, int pc)
 {
        struct ring_buffer_event *entry;
+       int val;
 
        *current_rb = trace_file->tr->trace_buffer.buffer;
+
+       if ((trace_file->flags &
+            (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+           (entry = this_cpu_read(trace_buffered_event))) {
+               /* Try to use the per cpu buffer first */
+               val = this_cpu_inc_return(trace_buffered_event_cnt);
+               if (val == 1) {
+                       trace_event_setup(entry, type, flags, pc);
+                       entry->array[0] = len;
+                       return entry;
+               }
+               this_cpu_dec(trace_buffered_event_cnt);
+       }
+
        entry = trace_buffer_lock_reserve(*current_rb,
                                         type, len, flags, pc);
        /*
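trace_event_buffer_lock_reserve() now tries the per-CPU temp event first when the file is soft-disabled or filtered: this_cpu_inc_return() on trace_buffered_event_cnt acts both as a recursion guard and as the hook trace_buffered_event_disable() uses to push callers back onto the ring buffer, and the requested length is saved in entry->array[0] for the commit path above. The resulting fast path for a filtered event, as a sketch:

	/*
	 * entry = trace_event_buffer_lock_reserve(&buffer, trace_file,
	 *					    type, len, flags, pc);
	 * ... fill in the entry fields ...
	 *
	 * filtered out -> __trace_event_discard_commit(buffer, entry);
	 *		   (just drops the per-CPU temp slot)
	 * accepted     -> __buffer_unlock_commit(buffer, entry);
	 *		   (copies the temp event via ring_buffer_write())
	 */
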
@@ -1738,17 +1854,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 }
 EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
-                                 int type, unsigned long len,
-                                 unsigned long flags, int pc)
-{
-       *current_rb = global_trace.trace_buffer.buffer;
-       return trace_buffer_lock_reserve(*current_rb,
-                                        type, len, flags, pc);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
-
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
                                     struct ring_buffer *buffer,
                                     struct ring_buffer_event *event,
@@ -1760,14 +1865,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
        ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
        ftrace_trace_userstack(buffer, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
-
-void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
-                                        struct ring_buffer_event *event)
-{
-       ring_buffer_discard_commit(buffer, event);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
 
 void
 trace_function(struct trace_array *tr,
@@ -3571,6 +3668,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
        if (mask == TRACE_ITER_RECORD_CMD)
                trace_event_enable_cmd_record(enabled);
 
+       if (mask == TRACE_ITER_EVENT_FORK)
+               trace_event_follow_fork(tr, enabled);
+
        if (mask == TRACE_ITER_OVERWRITE) {
                ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
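TRACE_ITER_EVENT_FORK gains a side effect in set_tracer_flag(): trace_event_follow_fork() (added in the companion trace_events.c changes, not shown here) presumably registers or unregisters sched fork/exit handlers so that children of tasks listed in set_event_pid are traced as well. Assuming the flag is exposed as the 'event-fork' option and tracefs is mounted at /sys/kernel/tracing, usage would look roughly like:

	# echo $$ > /sys/kernel/tracing/set_event_pid
	# echo 1 > /sys/kernel/tracing/options/event-fork
	# echo 1 > /sys/kernel/tracing/events/sched/enable
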
@@ -3658,7 +3758,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
        if (cnt >= sizeof(buf))
                return -EINVAL;
 
-       if (copy_from_user(&buf, ubuf, cnt))
+       if (copy_from_user(buf, ubuf, cnt))
                return -EFAULT;
 
        buf[cnt] = 0;
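
The copy_from_user() tweaks here and in the two later hunks (tracing_set_trace_write() and tracing_clock_write()) drop a stray '&': buf is a local char array, so &buf has the same address but a pointer-to-array type, while plain buf decays to char * and matches the void * destination copy_from_user() expects. A sketch of the corrected pattern, assuming the 64-byte buffer used by tracing_trace_options_write():

	char buf[64];

	if (cnt >= sizeof(buf))
		return -EINVAL;

	if (copy_from_user(buf, ubuf, cnt))	/* buf, not &buf */
		return -EFAULT;

	buf[cnt] = 0;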
@@ -3804,11 +3904,18 @@ static const char readme_msg[] =
        "\t   trigger: traceon, traceoff\n"
        "\t            enable_event:<system>:<event>\n"
        "\t            disable_event:<system>:<event>\n"
+#ifdef CONFIG_HIST_TRIGGERS
+       "\t            enable_hist:<system>:<event>\n"
+       "\t            disable_hist:<system>:<event>\n"
+#endif
 #ifdef CONFIG_STACKTRACE
        "\t\t    stacktrace\n"
 #endif
 #ifdef CONFIG_TRACER_SNAPSHOT
        "\t\t    snapshot\n"
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+       "\t\t    hist (see below)\n"
 #endif
        "\t   example: echo traceoff > events/block/block_unplug/trigger\n"
        "\t            echo traceoff:3 > events/block/block_unplug/trigger\n"
@@ -3825,6 +3932,56 @@ static const char readme_msg[] =
        "\t   To remove a trigger with a count:\n"
        "\t     echo '!<trigger>:0 > <system>/<event>/trigger\n"
        "\t   Filters can be ignored when removing a trigger.\n"
+#ifdef CONFIG_HIST_TRIGGERS
+       "      hist trigger\t- If set, event hits are aggregated into a hash table\n"
+       "\t    Format: hist:keys=<field1[,field2,...]>\n"
+       "\t            [:values=<field1[,field2,...]>]\n"
+       "\t            [:sort=<field1[,field2,...]>]\n"
+       "\t            [:size=#entries]\n"
+       "\t            [:pause][:continue][:clear]\n"
+       "\t            [:name=histname1]\n"
+       "\t            [if <filter>]\n\n"
+       "\t    When a matching event is hit, an entry is added to a hash\n"
+       "\t    table using the key(s) and value(s) named, and the value of a\n"
+       "\t    sum called 'hitcount' is incremented.  Keys and values\n"
+       "\t    correspond to fields in the event's format description.  Keys\n"
+       "\t    can be any field, or the special string 'stacktrace'.\n"
+       "\t    Compound keys consisting of up to two fields can be specified\n"
+       "\t    by the 'keys' keyword.  Values must correspond to numeric\n"
+       "\t    fields.  Sort keys consisting of up to two fields can be\n"
+       "\t    specified using the 'sort' keyword.  The sort direction can\n"
+       "\t    be modified by appending '.descending' or '.ascending' to a\n"
+       "\t    sort field.  The 'size' parameter can be used to specify more\n"
+       "\t    or fewer than the default 2048 entries for the hashtable size.\n"
+       "\t    If a hist trigger is given a name using the 'name' parameter,\n"
+       "\t    its histogram data will be shared with other triggers of the\n"
+       "\t    same name, and trigger hits will update this common data.\n\n"
+       "\t    Reading the 'hist' file for the event will dump the hash\n"
+       "\t    table in its entirety to stdout.  If there are multiple hist\n"
+       "\t    triggers attached to an event, there will be a table for each\n"
+       "\t    trigger in the output.  The table displayed for a named\n"
+       "\t    trigger will be the same as any other instance having the\n"
+       "\t    same name.  The default format used to display a given field\n"
+       "\t    can be modified by appending any of the following modifiers\n"
+       "\t    to the field name, as applicable:\n\n"
+       "\t            .hex        display a number as a hex value\n"
+       "\t            .sym        display an address as a symbol\n"
+       "\t            .sym-offset display an address as a symbol and offset\n"
+       "\t            .execname   display a common_pid as a program name\n"
+       "\t            .syscall    display a syscall id as a syscall name\n\n"
+       "\t            .log2       display log2 value rather than raw number\n\n"
+       "\t    The 'pause' parameter can be used to pause an existing hist\n"
+       "\t    trigger or to start a hist trigger but not log any events\n"
+       "\t    until told to do so.  'continue' can be used to start or\n"
+       "\t    restart a paused hist trigger.\n\n"
+       "\t    The 'clear' parameter will clear the contents of a running\n"
+       "\t    hist trigger and leave its current paused/active state\n"
+       "\t    unchanged.\n\n"
+       "\t    The enable_hist and disable_hist triggers can be used to\n"
+       "\t    have one event conditionally start and stop another event's\n"
+       "\t    already-attached hist trigger.  The syntax is analagous to\n"
+       "\t    the enable_event and disable_event triggers.\n"
+#endif
 ;
 
 static ssize_t
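
The CONFIG_HIST_TRIGGERS help text above mirrors the hist trigger syntax. A concrete usage sketch, assuming tracefs at /sys/kernel/tracing and the kmem:kmalloc event with its call_site and bytes_req fields:

	# echo 'hist:keys=call_site.sym:values=bytes_req:sort=bytes_req.descending' \
		> /sys/kernel/tracing/events/kmem/kmalloc/trigger
	# cat /sys/kernel/tracing/events/kmem/kmalloc/hist
	# echo '!hist:keys=call_site.sym:values=bytes_req:sort=bytes_req.descending' \
		> /sys/kernel/tracing/events/kmem/kmalloc/trigger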
@@ -4474,7 +4631,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
        if (cnt > MAX_TRACER_SIZE)
                cnt = MAX_TRACER_SIZE;
 
-       if (copy_from_user(&buf, ubuf, cnt))
+       if (copy_from_user(buf, ubuf, cnt))
                return -EFAULT;
 
        buf[cnt] = 0;
@@ -4733,19 +4890,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
        struct trace_iterator *iter = filp->private_data;
        ssize_t sret;
 
-       /* return any leftover data */
-       sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
-       if (sret != -EBUSY)
-               return sret;
-
-       trace_seq_init(&iter->seq);
-
        /*
         * Avoid more than one consumer on a single file descriptor
         * This is just a matter of traces coherency, the ring buffer itself
         * is protected.
         */
        mutex_lock(&iter->mutex);
+
+       /* return any leftover data */
+       sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+       if (sret != -EBUSY)
+               goto out;
+
+       trace_seq_init(&iter->seq);
+
        if (iter->trace->read) {
                sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
                if (sret)
@@ -5264,7 +5422,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
        if (cnt >= sizeof(buf))
                return -EINVAL;
 
-       if (copy_from_user(&buf, ubuf, cnt))
+       if (copy_from_user(buf, ubuf, cnt))
                return -EFAULT;
 
        buf[cnt] = 0;
@@ -5772,9 +5930,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                return -EBUSY;
 #endif
 
-       if (splice_grow_spd(pipe, &spd))
-               return -ENOMEM;
-
        if (*ppos & (PAGE_SIZE - 1))
                return -EINVAL;
 
@@ -5784,6 +5939,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                len &= PAGE_MASK;
        }
 
+       if (splice_grow_spd(pipe, &spd))
+               return -ENOMEM;
+
  again:
        trace_access_lock(iter->cpu_file);
        entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
@@ -5841,19 +5999,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
        /* did we read anything? */
        if (!spd.nr_pages) {
                if (ret)
-                       return ret;
+                       goto out;
 
+               ret = -EAGAIN;
                if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
-                       return -EAGAIN;
+                       goto out;
 
                ret = wait_on_pipe(iter, true);
                if (ret)
-                       return ret;
+                       goto out;
 
                goto again;
        }
 
        ret = splice_to_pipe(pipe, &spd);
+out:
        splice_shrink_spd(&spd);
 
        return ret;
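
This is the memory-leak fix named in the subject: splice_grow_spd() allocates the spd page/partial arrays, and the old code could return early (-EINVAL, a pending error, -EAGAIN, or a wait failure) without ever calling splice_shrink_spd(). The grow is now done only after the cheap alignment checks, and every later exit funnels through the new out: label. The resulting shape, as a sketch built from the hunks above:

	if (*ppos & (PAGE_SIZE - 1))
		return -EINVAL;			/* nothing allocated yet */

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;			/* grow failed, nothing to free */
	...
	ret = splice_to_pipe(pipe, &spd);
 out:
	splice_shrink_spd(&spd);		/* reached on every later exit */
	return ret;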
@@ -6650,7 +6810,7 @@ static int instance_mkdir(const char *name)
        if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
                goto out_free_tr;
 
-       tr->trace_flags = global_trace.trace_flags;
+       tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
        cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
 
@@ -6724,6 +6884,12 @@ static int instance_rmdir(const char *name)
 
        list_del(&tr->list);
 
+       /* Disable all the flags that were enabled coming in */
+       for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
+               if ((1 << i) & ZEROED_TRACE_FLAGS)
+                       set_tracer_flag(tr, 1 << i, 0);
+       }
+
        tracing_set_nop(tr);
        event_trace_del_tracer(tr);
        ftrace_destroy_function_files(tr);
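
instance_rmdir() now clears the ZEROED_TRACE_FLAGS bits through set_tracer_flag() rather than by masking tr->trace_flags directly, so the per-flag side effects run; for TRACE_ITER_EVENT_FORK that means trace_event_follow_fork(tr, 0) drops the fork-following hooks before the instance is torn down. The loop variable's declaration sits outside the context shown in this hunk; with it, the loop reads:

	int i;

	/* Disable all the flags that were enabled coming in */
	for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
		if ((1 << i) & ZEROED_TRACE_FLAGS)
			set_tracer_flag(tr, 1 << i, 0);
	}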