perf tools: Construct LBR call chain
author Kan Liang <kan.liang@intel.com>
Mon, 5 Jan 2015 18:23:05 +0000 (13:23 -0500)
committer Ingo Molnar <mingo@kernel.org>
Wed, 18 Feb 2015 16:16:18 +0000 (17:16 +0100)
LBR call stack only has user-space callchains. It is output in the
PERF_SAMPLE_BRANCH_STACK data format. For kernel callchains, it's
still in the form of PERF_SAMPLE_CALLCHAIN.

The perf tool has to handle both data sources to construct a
complete callstack.

For the "perf report -D" option, both lbr and fp information will be
displayed.

A new call chain recording option "lbr" is introduced into the perf
tool for LBR call stack. The user can use --call-graph lbr to get
the call stack information from hardware.

Here are some examples.

When profiling bc(1) on Fedora 19:

  echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph lbr bc -l < cmd

If enabling LBR, perf report output looks like:

    50.36%       bc  bc                 [.] bc_divide
                 |
                 --- bc_divide
                     execute
                     run_code
                     yyparse
                     main
                     __libc_start_main
                     _start
    33.66%       bc  bc                 [.] _one_mult
                 |
                 --- _one_mult
                     bc_divide
                     execute
                     run_code
                     yyparse
                     main
                     __libc_start_main
                     _start
     7.62%       bc  bc                 [.] _bc_do_add
                 |
                 --- _bc_do_add
                    |
                    |--99.89%-- 0x2000186a8
                     --0.11%-- [...]
     6.83%       bc  bc                 [.] _bc_do_sub
                 |
                 --- _bc_do_sub
                    |
                    |--99.94%-- bc_add
                    |          execute
                    |          run_code
                    |          yyparse
                    |          main
                    |          __libc_start_main
                    |          _start
                     --0.06%-- [...]
     0.46%       bc  libc-2.17.so       [.] __memset_sse2
                 |
                 --- __memset_sse2
                    |
                    |--54.13%-- bc_new_num
                    |          |
                    |          |--51.00%-- bc_divide
                    |          |          execute
                    |          |          run_code
                    |          |          yyparse
                    |          |          main
                    |          |          __libc_start_main
                    |          |          _start
                    |          |
                    |          |--30.46%-- _bc_do_sub
                    |          |          bc_add
                    |          |          execute
                    |          |          run_code
                    |          |          yyparse
                    |          |          main
                    |          |          __libc_start_main
                    |          |          _start
                    |          |
                    |           --18.55%-- _bc_do_add
                    |                     bc_add
                    |                     execute
                    |                     run_code
                    |                     yyparse
                    |                     main
                    |                     __libc_start_main
                    |                     _start
                    |
                     --45.87%-- bc_divide
                               execute
                               run_code
                               yyparse
                               main
                               __libc_start_main
                               _start

If using FP, perf report output looks like:

  echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph fp bc -l < cmd

    50.49%       bc  bc                 [.] bc_divide
                 |
                 --- bc_divide
    33.57%       bc  bc                 [.] _one_mult
                 |
                 --- _one_mult
     7.61%       bc  bc                 [.] _bc_do_add
                 |
                 --- _bc_do_add
                     0x2000186a8
     6.88%       bc  bc                 [.] _bc_do_sub
                 |
                 --- _bc_do_sub
     0.42%       bc  libc-2.17.so       [.] __memcpy_ssse3_back
                 |
                 --- __memcpy_ssse3_back

If using LBR, perf report -D output looks like:

3458145275743 0x2fd750 [0xd8]: PERF_RECORD_SAMPLE(IP, 0x2): 9748/9748: 0x408ea8 period: 609644 addr: 0
... LBR call chain: nr:8
.....  0: fffffffffffffe00
.....  1: 0000000000408e50
.....  2: 000000000040a458
.....  3: 000000000040562e
.....  4: 0000000000408590
.....  5: 00000000004022c0
.....  6: 00000000004015dd
.....  7: 0000003d1cc21b43
... FP chain: nr:2
.....  0: fffffffffffffe00
.....  1: 0000000000408ea8
 ... thread: bc:9748
 ...... dso: /usr/bin/bc

The LBR call stack has the following known limitations:

 - Zero length calls are not filtered out by the hardware

 - Exception handling such as setjmp/longjmp will cause calls/returns
   not to match

 - Pushing a different return address onto the stack will cause
   calls/returns not to match

 - If callstack is deeper than the LBR, only the last entries are
   captured

Tested-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: David Ahern <dsahern@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Simon Que <sque@chromium.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1420482185-29830-3-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
tools/perf/util/evsel.h
tools/perf/util/machine.c
tools/perf/util/session.c

index 3862274..dcf202a 100644 (file)
@@ -355,4 +355,8 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node);  \
      (_evsel) && (_evsel)->leader == (_leader);                                        \
      (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node))
 
+static inline bool has_branch_callstack(struct perf_evsel *evsel)
+{
+       return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
+}
 #endif /* __PERF_EVSEL_H */
index 1bca3a9..9e0f60a 100644 (file)
@@ -1502,18 +1502,100 @@ static int remove_loops(struct branch_entry *l, int nr)
        return nr;
 }
 
-static int thread__resolve_callchain_sample(struct thread *thread,
-                                            struct ip_callchain *chain,
-                                            struct branch_stack *branch,
-                                            struct symbol **parent,
-                                            struct addr_location *root_al,
-                                            int max_stack)
+/*
+ * Resolve LBR callstack chain sample
+ * Return:
+ * 1 on success, LBR callchain information was obtained
+ * 0 if no LBR callchain information is available, should try fp
+ * negative error code on other errors.
+ */
+static int resolve_lbr_callchain_sample(struct thread *thread,
+                                       struct perf_sample *sample,
+                                       struct symbol **parent,
+                                       struct addr_location *root_al,
+                                       int max_stack)
 {
+       struct ip_callchain *chain = sample->callchain;
+       int chain_nr = min(max_stack, (int)chain->nr);
+       int i, j, err;
+       u64 ip;
+
+       for (i = 0; i < chain_nr; i++) {
+               if (chain->ips[i] == PERF_CONTEXT_USER)
+                       break;
+       }
+
+       /* LBR only affects the user callchain */
+       if (i != chain_nr) {
+               struct branch_stack *lbr_stack = sample->branch_stack;
+               int lbr_nr = lbr_stack->nr;
+               /*
+                * LBR callstack can only get user call chain.
+                * The mix_chain_nr is kernel call chain
+                * number plus LBR user call chain number.
+                * i is kernel call chain number,
+                * 1 is PERF_CONTEXT_USER,
+                * lbr_nr + 1 is the user call chain number.
+                * For details, please refer to the comments
+                * in callchain__printf
+                */
+               int mix_chain_nr = i + 1 + lbr_nr + 1;
+
+               if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) {
+                       pr_warning("corrupted callchain. skipping...\n");
+                       return 0;
+               }
+
+               for (j = 0; j < mix_chain_nr; j++) {
+                       if (callchain_param.order == ORDER_CALLEE) {
+                               if (j < i + 1)
+                                       ip = chain->ips[j];
+                               else if (j > i + 1)
+                                       ip = lbr_stack->entries[j - i - 2].from;
+                               else
+                                       ip = lbr_stack->entries[0].to;
+                       } else {
+                               if (j < lbr_nr)
+                                       ip = lbr_stack->entries[lbr_nr - j - 1].from;
+                               else if (j > lbr_nr)
+                                       ip = chain->ips[i + 1 - (j - lbr_nr)];
+                               else
+                                       ip = lbr_stack->entries[0].to;
+                       }
+
+                       err = add_callchain_ip(thread, parent, root_al, false, ip);
+                       if (err)
+                               return (err < 0) ? err : 0;
+               }
+               return 1;
+       }
+
+       return 0;
+}
+
+static int thread__resolve_callchain_sample(struct thread *thread,
+                                           struct perf_evsel *evsel,
+                                           struct perf_sample *sample,
+                                           struct symbol **parent,
+                                           struct addr_location *root_al,
+                                           int max_stack)
+{
+       struct branch_stack *branch = sample->branch_stack;
+       struct ip_callchain *chain = sample->callchain;
        int chain_nr = min(max_stack, (int)chain->nr);
        int i, j, err;
        int skip_idx = -1;
        int first_call = 0;
 
+       callchain_cursor_reset(&callchain_cursor);
+
+       if (has_branch_callstack(evsel)) {
+               err = resolve_lbr_callchain_sample(thread, sample, parent,
+                                                  root_al, max_stack);
+               if (err)
+                       return (err < 0) ? err : 0;
+       }
+
        /*
         * Based on DWARF debug information, some architectures skip
         * a callchain entry saved by the kernel.
@@ -1521,8 +1603,6 @@ static int thread__resolve_callchain_sample(struct thread *thread,
        if (chain->nr < PERF_MAX_STACK_DEPTH)
                skip_idx = arch_skip_callchain_idx(thread, chain);
 
-       callchain_cursor_reset(&callchain_cursor);
-
        /*
         * Add branches to call stack for easier browsing. This gives
         * more context for a sample than just the callers.
@@ -1623,9 +1703,9 @@ int thread__resolve_callchain(struct thread *thread,
                              struct addr_location *root_al,
                              int max_stack)
 {
-       int ret = thread__resolve_callchain_sample(thread, sample->callchain,
-                                                  sample->branch_stack,
-                                                  parent, root_al, max_stack);
+       int ret = thread__resolve_callchain_sample(thread, evsel,
+                                                  sample, parent,
+                                                  root_al, max_stack);
        if (ret)
                return ret;
 
index 0baf75f..504b7e6 100644 (file)
@@ -553,15 +553,67 @@ int perf_session_queue_event(struct perf_session *s, union perf_event *event,
        return 0;
 }
 
-static void callchain__printf(struct perf_sample *sample)
+static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 {
+       struct ip_callchain *callchain = sample->callchain;
+       struct branch_stack *lbr_stack = sample->branch_stack;
+       u64 kernel_callchain_nr = callchain->nr;
        unsigned int i;
 
-       printf("... chain: nr:%" PRIu64 "\n", sample->callchain->nr);
+       for (i = 0; i < kernel_callchain_nr; i++) {
+               if (callchain->ips[i] == PERF_CONTEXT_USER)
+                       break;
+       }
+
+       if ((i != kernel_callchain_nr) && lbr_stack->nr) {
+               u64 total_nr;
+               /*
+                * LBR callstack can only get user call chain,
+                * i is kernel call chain number,
+                * 1 is PERF_CONTEXT_USER.
+                *
+                * The user call chain is stored in LBR registers.
+                * LBR are pair registers. The caller is stored
+                * in "from" register, while the callee is stored
+                * in "to" register.
+                * For example, there is a call stack
+                * "A"->"B"->"C"->"D".
+                * The LBR registers will record it like
+                * "C"->"D", "B"->"C", "A"->"B".
+                * So only the first "to" register and all "from"
+                * registers are needed to construct the whole stack.
+                */
+               total_nr = i + 1 + lbr_stack->nr + 1;
+               kernel_callchain_nr = i + 1;
+
+               printf("... LBR call chain: nr:%" PRIu64 "\n", total_nr);
+
+               for (i = 0; i < kernel_callchain_nr; i++)
+                       printf("..... %2d: %016" PRIx64 "\n",
+                              i, callchain->ips[i]);
+
+               printf("..... %2d: %016" PRIx64 "\n",
+                      (int)(kernel_callchain_nr), lbr_stack->entries[0].to);
+               for (i = 0; i < lbr_stack->nr; i++)
+                       printf("..... %2d: %016" PRIx64 "\n",
+                              (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from);
+       }
+}
+
+static void callchain__printf(struct perf_evsel *evsel,
+                             struct perf_sample *sample)
+{
+       unsigned int i;
+       struct ip_callchain *callchain = sample->callchain;
+
+       if (has_branch_callstack(evsel))
+               callchain__lbr_callstack_printf(sample);
+
+       printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
 
-       for (i = 0; i < sample->callchain->nr; i++)
+       for (i = 0; i < callchain->nr; i++)
                printf("..... %2d: %016" PRIx64 "\n",
-                      i, sample->callchain->ips[i]);
+                      i, callchain->ips[i]);
 }
 
 static void branch_stack__printf(struct perf_sample *sample)
@@ -718,9 +770,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
        sample_type = evsel->attr.sample_type;
 
        if (sample_type & PERF_SAMPLE_CALLCHAIN)
-               callchain__printf(sample);
+               callchain__printf(evsel, sample);
 
-       if (sample_type & PERF_SAMPLE_BRANCH_STACK)
+       if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel))
                branch_stack__printf(sample);
 
        if (sample_type & PERF_SAMPLE_REGS_USER)