Merge tag 'mac80211-for-davem-2016-07-06' of git://git.kernel.org/pub/scm/linux/kerne...
[cascardo/linux.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <sys/ptrace.h>
47 #include <linux/random.h>
48 #include <linux/stringify.h>
49
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC              02000000
52 #endif
53
/*
 * Global state for the 'perf trace' tool: the tool callbacks, syscall
 * tables, target/filter settings, accumulated statistics and the output
 * formatting knobs set up from the command line.
 */
struct trace {
        struct perf_tool        tool;
        struct syscalltbl       *sctbl;         /* syscall id <-> name table */
        struct {
                int             max;            /* highest syscall id in 'table' */
                struct syscall  *table;
                struct {
                        /* raw_syscalls:{sys_enter,sys_exit} tracepoints */
                        struct perf_evsel *sys_enter,
                                          *sys_exit;
                }               events;
        } syscalls;
        struct record_opts      opts;
        struct perf_evlist      *evlist;
        struct machine          *host;
        struct thread           *current;       /* thread of the last event seen */
        u64                     base_time;      /* timestamp of the first event */
        FILE                    *output;        /* where formatted events go */
        unsigned long           nr_events;
        struct strlist          *ev_qualifier;  /* -e syscall name list */
        struct {
                size_t          nr;
                int             *entries;       /* syscall ids for ev_qualifier */
        }                       ev_qualifier_ids;
        struct intlist          *tid_list;
        struct intlist          *pid_list;
        struct {
                size_t          nr;
                pid_t           *entries;       /* pids to exclude from the trace */
        }                       filter_pids;
        double                  duration_filter; /* only show events >= this, in ms */
        double                  runtime_ms;
        struct {
                /* tool self-stats, shown with --tool-stats */
                u64             vfs_getname,
                                proc_getname;
        } stats;
        unsigned int            max_stack;
        unsigned int            min_stack;
        bool                    not_ev_qualifier; /* ev_qualifier is an exclude list */
        bool                    live;
        bool                    full_time;
        bool                    sched;
        bool                    multiple_threads;
        bool                    summary;
        bool                    summary_only;
        bool                    show_comm;
        bool                    show_tool_stats;
        bool                    trace_syscalls;
        bool                    kernel_syscallchains;
        bool                    force;
        bool                    vfs_getname;    /* probe:vfs_getname is in use */
        int                     trace_pgfaults;
        int                     open_id;        /* syscall id of "open", cached */
};
107
/*
 * Accessor for one tracepoint field: 'offset' locates the field inside
 * a sample's raw data, and exactly one of the two function pointers is
 * installed depending on whether the field is read as an integer or as
 * a pointer into the raw payload.
 */
struct tp_field {
        int offset;
        union {
                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
        };
};
115
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned
 * integer of the given width from the sample's raw data.  memcpy is
 * used because the field may not be naturally aligned in raw_data.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
128
/*
 * Generate tp_field__swapped_u{16,32,64}(): as above, but byte-swap the
 * value, for samples recorded on a machine of the opposite endianness.
 * No 8-bit variant: a single byte has no endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
140
141 static int tp_field__init_uint(struct tp_field *field,
142                                struct format_field *format_field,
143                                bool needs_swap)
144 {
145         field->offset = format_field->offset;
146
147         switch (format_field->size) {
148         case 1:
149                 field->integer = tp_field__u8;
150                 break;
151         case 2:
152                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153                 break;
154         case 4:
155                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156                 break;
157         case 8:
158                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159                 break;
160         default:
161                 return -1;
162         }
163
164         return 0;
165 }
166
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
168 {
169         return sample->raw_data + field->offset;
170 }
171
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174         field->offset = format_field->offset;
175         field->pointer = tp_field__ptr;
176         return 0;
177 }
178
/*
 * Per-evsel private data for the raw_syscalls tracepoints: the syscall
 * 'id' field plus either the 'args' array (sys_enter) or the 'ret'
 * value (sys_exit) — a union since each evsel only has one of the two.
 */
struct syscall_tp {
        struct tp_field id;
        union {
                struct tp_field args, ret;
        };
};
185
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187                                           struct tp_field *field,
188                                           const char *name)
189 {
190         struct format_field *format_field = perf_evsel__field(evsel, name);
191
192         if (format_field == NULL)
193                 return -1;
194
195         return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197
/*
 * Initialize the syscall_tp member named 'name' (e.g. id, ret) on the
 * evsel's private data as a uint accessor; the member name doubles as
 * the tracepoint field name via stringification.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201
/*
 * Look up tracepoint field 'name' on 'evsel' and initialize 'field' as
 * a pointer accessor for it.  Returns 0 on success, -1 when the field
 * doesn't exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *fmt = perf_evsel__field(evsel, name);

        return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
213
/*
 * Initialize the syscall_tp member named 'name' (e.g. args) on the
 * evsel's private data as a pointer accessor; the member name doubles
 * as the tracepoint field name via stringification.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217
/* Free the evsel's private syscall_tp data, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
        zfree(&evsel->priv);
        perf_evsel__delete(evsel);
}
223
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226         evsel->priv = malloc(sizeof(struct syscall_tp));
227         if (evsel->priv != NULL) {
228                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229                         goto out_delete;
230
231                 evsel->handler = handler;
232                 return 0;
233         }
234
235         return -ENOMEM;
236
237 out_delete:
238         zfree(&evsel->priv);
239         return -ENOENT;
240 }
241
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
243 {
244         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
245
246         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
247         if (IS_ERR(evsel))
248                 evsel = perf_evsel__newtp("syscalls", direction);
249
250         if (IS_ERR(evsel))
251                 return NULL;
252
253         if (perf_evsel__init_syscall_tp(evsel, handler))
254                 goto out_delete;
255
256         return evsel;
257
258 out_delete:
259         perf_evsel__delete_priv(evsel);
260         return NULL;
261 }
262
/* Read the syscall_tp member 'name' from 'sample' as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

/* Read the syscall_tp member 'name' from 'sample' as a pointer. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
270
/*
 * Context handed to each argument beautifier: the raw argument value,
 * the thread/trace it came from, an optional per-formatter parameter
 * ('parm', e.g. a strarray), the argument index and a mask of argument
 * slots the formatter has already consumed.
 */
struct syscall_arg {
        unsigned long val;
        struct thread *thread;
        struct trace  *trace;
        void          *parm;
        u8            idx;
        u8            mask;
};
279
/*
 * A value -> name table: entry for value v is entries[v - offset].
 * Used to print symbolic names for enum-like syscall arguments.
 */
struct strarray {
        int         offset;     /* value of entries[0] */
        int         nr_entries;
        const char **entries;
};

/* Define 'strarray__<array>' over a table whose first entry is value 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}

/* As above, but the first entry corresponds to value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
        .offset     = off, \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}
296
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298                                                 const char *intfmt,
299                                                 struct syscall_arg *arg)
300 {
301         struct strarray *sa = arg->parm;
302         int idx = arg->val - sa->offset;
303
304         if (idx < 0 || idx >= sa->nr_entries)
305                 return scnprintf(bf, size, intfmt, arg->val);
306
307         return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309
/* Beautifier: print arg->val via its strarray, decimal fallback. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
317
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
/* Beautifier: print arg->val via its strarray, hex fallback (for ioctl cmds). */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
331
/* Forward declaration: fd beautifier (resolves fd to a pathname), defined later. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
                                        struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
336
337 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
338                                            struct syscall_arg *arg)
339 {
340         int fd = arg->val;
341
342         if (fd == AT_FDCWD)
343                 return scnprintf(bf, size, "CWD");
344
345         return syscall_arg__scnprintf_fd(bf, size, arg);
346 }
347
348 #define SCA_FDAT syscall_arg__scnprintf_fd_at
349
/* Forward declaration: close(2) fd beautifier (also drops cached fd paths). */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

/* Beautifier: print the raw argument value in hex (addresses, opaque args). */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
                                         struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
362
363 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
364                                          struct syscall_arg *arg)
365 {
366         return scnprintf(bf, size, "%d", arg->val);
367 }
368
369 #define SCA_INT syscall_arg__scnprintf_int
370
/*
 * Value -> name tables for enum-like syscall arguments.  Each table's
 * index (plus the DEFINE_STRARRAY_OFFSET base, where used) must match
 * the kernel ABI value of the corresponding constant.
 */

/* bpf(2) commands: BPF_MAP_CREATE = 0, ... */
static const char *bpf_cmd[] = {
        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
        "MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops start at 1 (EPOLL_CTL_ADD), hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) which: ITIMER_REAL = 0, ... */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) operations: KEYCTL_GET_KEYRING_ID = 0, ... */
static const char *keyctl_options[] = {
        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence: SEEK_SET = 0, ...; DATA/HOLE only if libc defines them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) commands: F_DUPFD = 0, ... */
static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
        "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
        "F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* {get,set}rlimit(2)/prlimit64(2) resources: RLIMIT_CPU = 0, ... */
static const char *rlimit_resources[] = {
        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
        "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) how: SIG_BLOCK = 0, ... */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock ids: CLOCK_REALTIME = 0, ... */
static const char *clockid[] = {
        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2) address families: AF_UNSPEC = 0, ... */
static const char *socket_families[] = {
        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
        "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
436
437 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
438                                                  struct syscall_arg *arg)
439 {
440         size_t printed = 0;
441         int mode = arg->val;
442
443         if (mode == F_OK) /* 0 */
444                 return scnprintf(bf, size, "F");
445 #define P_MODE(n) \
446         if (mode & n##_OK) { \
447                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
448                 mode &= ~n##_OK; \
449         }
450
451         P_MODE(R);
452         P_MODE(W);
453         P_MODE(X);
454 #undef P_MODE
455
456         if (mode)
457                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
458
459         return printed;
460 }
461
462 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
463
/* Forward declaration: filename beautifier (uses vfs_getname), defined later. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
468
469 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
470                                                 struct syscall_arg *arg)
471 {
472         int printed = 0, flags = arg->val;
473
474 #define P_FLAG(n) \
475         if (flags & O_##n) { \
476                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
477                 flags &= ~O_##n; \
478         }
479
480         P_FLAG(CLOEXEC);
481         P_FLAG(NONBLOCK);
482 #undef P_FLAG
483
484         if (flags)
485                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
486
487         return printed;
488 }
489
490 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
491
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
/* Base of the terminal ioctl command range; the table below starts here. */
#define TCGETS          0x5401

/*
 * Terminal ioctl command names, indexed by (cmd - TCGETS).  The
 * designated initializers ([0x27], [0x50], [0x60]) skip unassigned
 * command numbers, leaving NULL holes in between.
 */
static const char *tioctls[] = {
        "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
        "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
        "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
        "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
        "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
        "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
        "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
        "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
        "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
        "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
        "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
        [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
        "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
        "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
        "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
518
/* getrandom(2) flag fallbacks for libcs/headers that don't define them yet. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK   0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM     0x0002
#endif
525
526 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
527                                                    struct syscall_arg *arg)
528 {
529         int printed = 0, flags = arg->val;
530
531 #define P_FLAG(n) \
532         if (flags & GRND_##n) { \
533                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
534                 flags &= ~GRND_##n; \
535         }
536
537         P_FLAG(RANDOM);
538         P_FLAG(NONBLOCK);
539 #undef P_FLAG
540
541         if (flags)
542                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
543
544         return printed;
545 }
546
547 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
548
/*
 * Shorthand for syscall_fmts entries: install the strarray beautifier
 * for argument slot 'arg' and point its parm at strarray__<array>.
 */
#define STRARRAY(arg, name, array) \
          .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
          .arg_parm      = { [arg] = &strarray__##array, }
552
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/flock.c"
555 #include "trace/beauty/futex_op.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567
568 static struct syscall_fmt {
569         const char *name;
570         const char *alias;
571         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
572         void       *arg_parm[6];
573         bool       errmsg;
574         bool       errpid;
575         bool       timeout;
576         bool       hexret;
577 } syscall_fmts[] = {
578         { .name     = "access",     .errmsg = true,
579           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
580         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
581         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
582         { .name     = "brk",        .hexret = true,
583           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
584         { .name     = "chdir",      .errmsg = true, },
585         { .name     = "chmod",      .errmsg = true, },
586         { .name     = "chroot",     .errmsg = true, },
587         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
588         { .name     = "clone",      .errpid = true, },
589         { .name     = "close",      .errmsg = true,
590           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
591         { .name     = "connect",    .errmsg = true, },
592         { .name     = "creat",      .errmsg = true, },
593         { .name     = "dup",        .errmsg = true, },
594         { .name     = "dup2",       .errmsg = true, },
595         { .name     = "dup3",       .errmsg = true, },
596         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
597         { .name     = "eventfd2",   .errmsg = true,
598           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
599         { .name     = "faccessat",  .errmsg = true, },
600         { .name     = "fadvise64",  .errmsg = true, },
601         { .name     = "fallocate",  .errmsg = true, },
602         { .name     = "fchdir",     .errmsg = true, },
603         { .name     = "fchmod",     .errmsg = true, },
604         { .name     = "fchmodat",   .errmsg = true,
605           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
606         { .name     = "fchown",     .errmsg = true, },
607         { .name     = "fchownat",   .errmsg = true,
608           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
609         { .name     = "fcntl",      .errmsg = true,
610           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
611           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
612         { .name     = "fdatasync",  .errmsg = true, },
613         { .name     = "flock",      .errmsg = true,
614           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
615         { .name     = "fsetxattr",  .errmsg = true, },
616         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
617         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
618         { .name     = "fstatfs",    .errmsg = true, },
619         { .name     = "fsync",    .errmsg = true, },
620         { .name     = "ftruncate", .errmsg = true, },
621         { .name     = "futex",      .errmsg = true,
622           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
623         { .name     = "futimesat", .errmsg = true,
624           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
625         { .name     = "getdents",   .errmsg = true, },
626         { .name     = "getdents64", .errmsg = true, },
627         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
628         { .name     = "getpid",     .errpid = true, },
629         { .name     = "getpgid",    .errpid = true, },
630         { .name     = "getppid",    .errpid = true, },
631         { .name     = "getrandom",  .errmsg = true,
632           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
633         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
634         { .name     = "getxattr",   .errmsg = true, },
635         { .name     = "inotify_add_watch",          .errmsg = true, },
636         { .name     = "ioctl",      .errmsg = true,
637           .arg_scnprintf = {
638 #if defined(__i386__) || defined(__x86_64__)
639 /*
640  * FIXME: Make this available to all arches.
641  */
642                              [1] = SCA_STRHEXARRAY, /* cmd */
643                              [2] = SCA_HEX, /* arg */ },
644           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
645 #else
646                              [2] = SCA_HEX, /* arg */ }, },
647 #endif
648         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
649         { .name     = "kill",       .errmsg = true,
650           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
651         { .name     = "lchown",    .errmsg = true, },
652         { .name     = "lgetxattr",  .errmsg = true, },
653         { .name     = "linkat",     .errmsg = true,
654           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
655         { .name     = "listxattr",  .errmsg = true, },
656         { .name     = "llistxattr", .errmsg = true, },
657         { .name     = "lremovexattr",  .errmsg = true, },
658         { .name     = "lseek",      .errmsg = true,
659           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
660           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
661         { .name     = "lsetxattr",  .errmsg = true, },
662         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
663         { .name     = "lsxattr",    .errmsg = true, },
664         { .name     = "madvise",    .errmsg = true,
665           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
666                              [2] = SCA_MADV_BHV, /* behavior */ }, },
667         { .name     = "mkdir",    .errmsg = true, },
668         { .name     = "mkdirat",    .errmsg = true,
669           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
670         { .name     = "mknod",      .errmsg = true, },
671         { .name     = "mknodat",    .errmsg = true,
672           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
673         { .name     = "mlock",      .errmsg = true,
674           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
675         { .name     = "mlockall",   .errmsg = true,
676           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
677         { .name     = "mmap",       .hexret = true,
678           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
679                              [2] = SCA_MMAP_PROT, /* prot */
680                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
681         { .name     = "mprotect",   .errmsg = true,
682           .arg_scnprintf = { [0] = SCA_HEX, /* start */
683                              [2] = SCA_MMAP_PROT, /* prot */ }, },
684         { .name     = "mq_unlink", .errmsg = true,
685           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
686         { .name     = "mremap",     .hexret = true,
687           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
688                              [3] = SCA_MREMAP_FLAGS, /* flags */
689                              [4] = SCA_HEX, /* new_addr */ }, },
690         { .name     = "munlock",    .errmsg = true,
691           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
692         { .name     = "munmap",     .errmsg = true,
693           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
694         { .name     = "name_to_handle_at", .errmsg = true,
695           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
696         { .name     = "newfstatat", .errmsg = true,
697           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
698         { .name     = "open",       .errmsg = true,
699           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
700         { .name     = "open_by_handle_at", .errmsg = true,
701           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
702                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
703         { .name     = "openat",     .errmsg = true,
704           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
705                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
706         { .name     = "perf_event_open", .errmsg = true,
707           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
708                              [3] = SCA_FD,  /* group_fd */
709                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
710         { .name     = "pipe2",      .errmsg = true,
711           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
712         { .name     = "poll",       .errmsg = true, .timeout = true, },
713         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
714         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
715         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
716         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
717         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
718         { .name     = "pwritev",    .errmsg = true, },
719         { .name     = "read",       .errmsg = true, },
720         { .name     = "readlink",   .errmsg = true, },
721         { .name     = "readlinkat", .errmsg = true,
722           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
723         { .name     = "readv",      .errmsg = true, },
724         { .name     = "recvfrom",   .errmsg = true,
725           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
726         { .name     = "recvmmsg",   .errmsg = true,
727           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
728         { .name     = "recvmsg",    .errmsg = true,
729           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
730         { .name     = "removexattr", .errmsg = true, },
731         { .name     = "renameat",   .errmsg = true,
732           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
733         { .name     = "rmdir",    .errmsg = true, },
734         { .name     = "rt_sigaction", .errmsg = true,
735           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
736         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
737         { .name     = "rt_sigqueueinfo", .errmsg = true,
738           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
739         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
740           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
741         { .name     = "sched_setscheduler",   .errmsg = true,
742           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
743         { .name     = "seccomp", .errmsg = true,
744           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
745                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
746         { .name     = "select",     .errmsg = true, .timeout = true, },
747         { .name     = "sendmmsg",    .errmsg = true,
748           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
749         { .name     = "sendmsg",    .errmsg = true,
750           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
751         { .name     = "sendto",     .errmsg = true,
752           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
753         { .name     = "set_tid_address", .errpid = true, },
754         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
755         { .name     = "setpgid",    .errmsg = true, },
756         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
757         { .name     = "setxattr",   .errmsg = true, },
758         { .name     = "shutdown",   .errmsg = true, },
759         { .name     = "socket",     .errmsg = true,
760           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
761                              [1] = SCA_SK_TYPE, /* type */ },
762           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
763         { .name     = "socketpair", .errmsg = true,
764           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
765                              [1] = SCA_SK_TYPE, /* type */ },
766           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
767         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
768         { .name     = "statfs",     .errmsg = true, },
769         { .name     = "swapoff",    .errmsg = true,
770           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
771         { .name     = "swapon",     .errmsg = true,
772           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
773         { .name     = "symlinkat",  .errmsg = true,
774           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
775         { .name     = "tgkill",     .errmsg = true,
776           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
777         { .name     = "tkill",      .errmsg = true,
778           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
779         { .name     = "truncate",   .errmsg = true, },
780         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
781         { .name     = "unlinkat",   .errmsg = true,
782           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
783         { .name     = "utime",  .errmsg = true, },
784         { .name     = "utimensat",  .errmsg = true,
785           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
786         { .name     = "utimes",  .errmsg = true, },
787         { .name     = "vmsplice",  .errmsg = true, },
788         { .name     = "wait4",      .errpid = true,
789           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
790         { .name     = "waitid",     .errpid = true,
791           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
792         { .name     = "write",      .errmsg = true, },
793         { .name     = "writev",     .errmsg = true, },
794 };
795
796 static int syscall_fmt__cmp(const void *name, const void *fmtp)
797 {
798         const struct syscall_fmt *fmt = fmtp;
799         return strcmp(name, fmt->name);
800 }
801
802 static struct syscall_fmt *syscall_fmt__find(const char *name)
803 {
804         const int nmemb = ARRAY_SIZE(syscall_fmts);
805         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
806 }
807
/*
 * Everything needed to pretty-print one syscall: its tracepoint format,
 * the argument fields, and the optional hand-written formatters looked
 * up in syscall_fmts[].  Filled lazily by trace__read_syscall_info().
 */
struct syscall {
	struct event_format *tp_format;	/* syscalls:sys_enter_NAME format */
	int                 nr_args;	/* arg fields, minus the syscall nr one */
	struct format_field *args;	/* first real argument field */
	const char          *name;	/* canonical name from the syscalltbl */
	bool                is_exit;	/* exit/exit_group: no sys_exit follows */
	struct syscall_fmt  *fmt;	/* static hints, may be NULL */
	size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void                **arg_parm;	/* per-arg parm handed to arg_scnprintf */
};
818
819 static size_t fprintf_duration(unsigned long t, FILE *fp)
820 {
821         double duration = (double)t / NSEC_PER_MSEC;
822         size_t printed = fprintf(fp, "(");
823
824         if (duration >= 1.0)
825                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
826         else if (duration >= 0.01)
827                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
828         else
829                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
830         return printed + fprintf(fp, "): ");
831 }
832
833 /**
834  * filename.ptr: The filename char pointer that will be vfs_getname'd
835  * filename.entry_str_pos: Where to insert the string translated from
836  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
837  */
struct thread_trace {
	u64               entry_time;	/* timestamp of the last sys_enter */
	u64               exit_time;	/* timestamp of the last sys_exit */
	bool              entry_pending;	/* entry formatted, waiting for exit */
	unsigned long     nr_events;	/* events seen for this thread */
	unsigned long     pfmaj, pfmin;	/* page faults (see TRACE_PFMAJ/PFMIN) */
	char              *entry_str;	/* buffer for the formatted entry line */
	double            runtime_ms;	/* accumulated runtime, in milliseconds */
	struct {
		unsigned long ptr;	/* see the comment above this struct */
		short int     entry_str_pos;
		bool          pending_open;	/* open fd return will name this file */
		unsigned int  namelen;	/* length of name below */
		char          *name;	/* last vfs_getname'd filename */
	} filename;
	struct {
		int       max;		/* highest fd slot allocated, -1 if none */
		char      **table;	/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id struct stats */
};
860
861 static struct thread_trace *thread_trace__new(void)
862 {
863         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
864
865         if (ttrace)
866                 ttrace->paths.max = -1;
867
868         ttrace->syscall_stats = intlist__new(NULL);
869
870         return ttrace;
871 }
872
873 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
874 {
875         struct thread_trace *ttrace;
876
877         if (thread == NULL)
878                 goto fail;
879
880         if (thread__priv(thread) == NULL)
881                 thread__set_priv(thread, thread_trace__new());
882
883         if (thread__priv(thread) == NULL)
884                 goto fail;
885
886         ttrace = thread__priv(thread);
887         ++ttrace->nr_events;
888
889         return ttrace;
890 fail:
891         color_fprintf(fp, PERF_COLOR_RED,
892                       "WARNING: not enough memory, dropping samples!\n");
893         return NULL;
894 }
895
896 #define TRACE_PFMAJ             (1 << 0)
897 #define TRACE_PFMIN             (1 << 1)
898
899 static const size_t trace__entry_str_size = 2048;
900
901 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
902 {
903         struct thread_trace *ttrace = thread__priv(thread);
904
905         if (fd > ttrace->paths.max) {
906                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
907
908                 if (npath == NULL)
909                         return -1;
910
911                 if (ttrace->paths.max != -1) {
912                         memset(npath + ttrace->paths.max + 1, 0,
913                                (fd - ttrace->paths.max) * sizeof(char *));
914                 } else {
915                         memset(npath, 0, (fd + 1) * sizeof(char *));
916                 }
917
918                 ttrace->paths.table = npath;
919                 ttrace->paths.max   = fd;
920         }
921
922         ttrace->paths.table[fd] = strdup(pathname);
923
924         return ttrace->paths.table[fd] != NULL ? 0 : -1;
925 }
926
927 static int thread__read_fd_path(struct thread *thread, int fd)
928 {
929         char linkname[PATH_MAX], pathname[PATH_MAX];
930         struct stat st;
931         int ret;
932
933         if (thread->pid_ == thread->tid) {
934                 scnprintf(linkname, sizeof(linkname),
935                           "/proc/%d/fd/%d", thread->pid_, fd);
936         } else {
937                 scnprintf(linkname, sizeof(linkname),
938                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
939         }
940
941         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
942                 return -1;
943
944         ret = readlink(linkname, pathname, sizeof(pathname));
945
946         if (ret < 0 || ret > st.st_size)
947                 return -1;
948
949         pathname[ret] = '\0';
950         return trace__set_fd_pathname(thread, fd, pathname);
951 }
952
953 static const char *thread__fd_path(struct thread *thread, int fd,
954                                    struct trace *trace)
955 {
956         struct thread_trace *ttrace = thread__priv(thread);
957
958         if (ttrace == NULL)
959                 return NULL;
960
961         if (fd < 0)
962                 return NULL;
963
964         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
965                 if (!trace->live)
966                         return NULL;
967                 ++trace->stats.proc_getname;
968                 if (thread__read_fd_path(thread, fd))
969                         return NULL;
970         }
971
972         return ttrace->paths.table[fd];
973 }
974
975 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
976                                         struct syscall_arg *arg)
977 {
978         int fd = arg->val;
979         size_t printed = scnprintf(bf, size, "%d", fd);
980         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
981
982         if (path)
983                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
984
985         return printed;
986 }
987
988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
989                                               struct syscall_arg *arg)
990 {
991         int fd = arg->val;
992         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
993         struct thread_trace *ttrace = thread__priv(arg->thread);
994
995         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
996                 zfree(&ttrace->paths.table[fd]);
997
998         return printed;
999 }
1000
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1002                                      unsigned long ptr)
1003 {
1004         struct thread_trace *ttrace = thread__priv(thread);
1005
1006         ttrace->filename.ptr = ptr;
1007         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1008 }
1009
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011                                               struct syscall_arg *arg)
1012 {
1013         unsigned long ptr = arg->val;
1014
1015         if (!arg->trace->vfs_getname)
1016                 return scnprintf(bf, size, "%#x", ptr);
1017
1018         thread__set_filename_pos(arg->thread, bf, ptr);
1019         return 0;
1020 }
1021
1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024         return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026
1027 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030
1031         return fprintf(fp, "%10.3f ", ts);
1032 }
1033
1034 static bool done = false;
1035 static bool interrupted = false;
1036
1037 static void sig_handler(int sig)
1038 {
1039         done = true;
1040         interrupted = sig == SIGINT;
1041 }
1042
1043 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1044                                         u64 duration, u64 tstamp, FILE *fp)
1045 {
1046         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1047         printed += fprintf_duration(duration, fp);
1048
1049         if (trace->multiple_threads) {
1050                 if (trace->show_comm)
1051                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1052                 printed += fprintf(fp, "%d ", thread->tid);
1053         }
1054
1055         return printed;
1056 }
1057
1058 static int trace__process_event(struct trace *trace, struct machine *machine,
1059                                 union perf_event *event, struct perf_sample *sample)
1060 {
1061         int ret = 0;
1062
1063         switch (event->header.type) {
1064         case PERF_RECORD_LOST:
1065                 color_fprintf(trace->output, PERF_COLOR_RED,
1066                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1067                 ret = machine__process_lost_event(machine, event, sample);
1068                 break;
1069         default:
1070                 ret = machine__process_event(machine, event, sample);
1071                 break;
1072         }
1073
1074         return ret;
1075 }
1076
1077 static int trace__tool_process(struct perf_tool *tool,
1078                                union perf_event *event,
1079                                struct perf_sample *sample,
1080                                struct machine *machine)
1081 {
1082         struct trace *trace = container_of(tool, struct trace, tool);
1083         return trace__process_event(trace, machine, event, sample);
1084 }
1085
1086 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1087 {
1088         struct machine *machine = vmachine;
1089
1090         if (machine->kptr_restrict_warned)
1091                 return NULL;
1092
1093         if (symbol_conf.kptr_restrict) {
1094                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1095                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1096                            "Kernel samples will not be resolved.\n");
1097                 machine->kptr_restrict_warned = true;
1098                 return NULL;
1099         }
1100
1101         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1102 }
1103
/*
 * Initialize symbol resolution, create the host machine, hook up the
 * kptr_restrict-aware kernel address resolver and synthesize the
 * pre-existing threads so their samples can be resolved.
 * Returns 0 on success, a negative error code otherwise.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	/* Pick up threads that already existed before we started. */
	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();	/* undo symbol__init() on failure */

	return err;
}
1126
/*
 * Pick a pretty-printer for each of sc's arguments: explicit hints from
 * syscall_fmts[] win, then heuristics based on the tracepoint field's
 * type and name take over.  Returns 0, or -1 on allocation failure.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* Hand-written formatter in syscall_fmts[] takes precedence. */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		/* String pointers with well-known filename-ish names. */
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		/* Integer fields whose name ends in "fd" are file descriptors. */
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1172
/*
 * Lazily fill trace->syscalls.table[id]: resolve the name, the static
 * formatting hints and the syscalls:sys_enter_NAME tracepoint format.
 * Returns 0 on success, -1 when the id is unknown, on allocation
 * failure, or when the tracepoint format can't be read.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table on demand, zeroing only the new entries. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry under the alias, e.g. "stat" -> "newstat". */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * The first field may be '__syscall_nr' or 'nr', carrying the
	 * syscall number, which is redundant here, so skip it.  Older
	 * kernels don't have that field, hence the check by name.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1231
/*
 * Translate the event qualifier strlist (syscall names) into ids in
 * trace->ev_qualifier_ids.  On any unknown name all offenders are
 * printed, the ids array is freed and -EINVAL is returned.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* First offender gets the "Error:" prefix, the rest commas. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1277
1278 /*
1279  * args is to be interpreted as a series of longs but we need to handle
1280  * 8-byte unaligned accesses. args points to raw_data within the event
1281  * and raw_data is guaranteed to be 8-byte unaligned because it is
1282  * preceded by raw_size which is a u32. So we need to copy args to a temp
1283  * variable to read it. Most notably this avoids extended load instructions
1284  * on unaligned addresses
1285  */
1286
/*
 * Format sc's arguments from the raw tracepoint payload into bf,
 * reading each value via memcpy() since args isn't 8-byte aligned (see
 * the comment above).  Returns the number of characters printed.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;	/* mirrors arg.mask, one bit per argument */
		struct syscall_arg arg = {
			.idx    = 0,
			.mask   = 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Skip args already consumed, presumably by a formatter that set arg.mask. */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			/* Use the per-arg formatter when there is one, else raw %ld. */
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1359
1360 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1361                                   union perf_event *event,
1362                                   struct perf_sample *sample);
1363
/*
 * Return the (lazily initialized) syscall descriptor for id, or NULL
 * when the id is invalid or its info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* First time we see this id: read its tracepoint info. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed partially. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1406
1407 static void thread__update_stats(struct thread_trace *ttrace,
1408                                  int id, struct perf_sample *sample)
1409 {
1410         struct int_node *inode;
1411         struct stats *stats;
1412         u64 duration = 0;
1413
1414         inode = intlist__findnew(ttrace->syscall_stats, id);
1415         if (inode == NULL)
1416                 return;
1417
1418         stats = inode->priv;
1419         if (stats == NULL) {
1420                 stats = malloc(sizeof(struct stats));
1421                 if (stats == NULL)
1422                         return;
1423                 init_stats(stats);
1424                 inode->priv = stats;
1425         }
1426
1427         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1428                 duration = sample->time - ttrace->entry_time;
1429
1430         update_stats(stats, duration);
1431 }
1432
1433 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1434 {
1435         struct thread_trace *ttrace;
1436         u64 duration;
1437         size_t printed;
1438
1439         if (trace->current == NULL)
1440                 return 0;
1441
1442         ttrace = thread__priv(trace->current);
1443
1444         if (!ttrace->entry_pending)
1445                 return 0;
1446
1447         duration = sample->time - ttrace->entry_time;
1448
1449         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1450         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1451         ttrace->entry_pending = false;
1452
1453         return printed;
1454 }
1455
/*
 * syscalls:sys_enter_* handler: format the entry line into
 * ttrace->entry_str and, unless the syscall never returns (exit,
 * exit_group), defer printing it until the matching sys_exit so the
 * duration and return value can be appended.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Entry line buffer, allocated once per thread and reused. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush any still-pending entry from another syscall as unfinished. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will follow, print right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the current thread, keeping a reference on it. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1514
1515 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1516                                     struct perf_sample *sample,
1517                                     struct callchain_cursor *cursor)
1518 {
1519         struct addr_location al;
1520
1521         if (machine__resolve(trace->host, &al, sample) < 0 ||
1522             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1523                 return -1;
1524
1525         return 0;
1526 }
1527
1528 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1529 {
1530         /* TODO: user-configurable print_opts */
1531         const unsigned int print_opts = EVSEL__PRINT_SYM |
1532                                         EVSEL__PRINT_DSO |
1533                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1534
1535         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1536 }
1537
/*
 * Handler for the raw_syscalls:sys_exit tracepoint: completes the line
 * started by trace__sys_enter with the formatted return value, duration
 * and (optionally) the callchain.
 *
 * Returns 0 on success, -1 when the syscall or per-thread state can't
 * be resolved.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * A successful open-style syscall whose pathname was collected by
	 * trace__vfs_getname: associate the name with the returned fd.
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	ttrace->exit_time = sample->time;

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		goto out; /* no entry seen, so no duration to pass the filter */

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: suppress output. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The entry line was already flushed (interleaved events). */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* Format the return value according to the syscall's fmt, if any. */
	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		/* Negative return: print the errno name and message. */
		char bf[STRERR_BUFSIZE];
		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Syscalls returning a pid: show the child's comm if known. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1637
1638 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1639                               union perf_event *event __maybe_unused,
1640                               struct perf_sample *sample)
1641 {
1642         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1643         struct thread_trace *ttrace;
1644         size_t filename_len, entry_str_len, to_move;
1645         ssize_t remaining_space;
1646         char *pos;
1647         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1648
1649         if (!thread)
1650                 goto out;
1651
1652         ttrace = thread__priv(thread);
1653         if (!ttrace)
1654                 goto out;
1655
1656         filename_len = strlen(filename);
1657
1658         if (ttrace->filename.namelen < filename_len) {
1659                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1660
1661                 if (f == NULL)
1662                                 goto out;
1663
1664                 ttrace->filename.namelen = filename_len;
1665                 ttrace->filename.name = f;
1666         }
1667
1668         strcpy(ttrace->filename.name, filename);
1669         ttrace->filename.pending_open = true;
1670
1671         if (!ttrace->filename.ptr)
1672                 goto out;
1673
1674         entry_str_len = strlen(ttrace->entry_str);
1675         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1676         if (remaining_space <= 0)
1677                 goto out;
1678
1679         if (filename_len > (size_t)remaining_space) {
1680                 filename += filename_len - remaining_space;
1681                 filename_len = remaining_space;
1682         }
1683
1684         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1685         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1686         memmove(pos + filename_len, pos, to_move);
1687         memcpy(pos, filename, filename_len);
1688
1689         ttrace->filename.ptr = 0;
1690         ttrace->filename.entry_str_pos = 0;
1691 out:
1692         return 0;
1693 }
1694
/*
 * Handler for sched:sched_stat_runtime (added when trace->sched is set):
 * accumulates on-CPU runtime per thread and for the whole session, for
 * use by the thread summary.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
	thread__put(thread);
	return 0;

out_dump:
	/*
	 * No per-thread state: just dump the raw tracepoint fields.
	 * NOTE(review): the trailing ')' in this format string has no
	 * matching '(' — looks like a cosmetic leftover, confirm.
	 */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	thread__put(thread);
	return 0;
}
1724
1725 static void bpf_output__printer(enum binary_printer_ops op,
1726                                 unsigned int val, void *extra)
1727 {
1728         FILE *output = extra;
1729         unsigned char ch = (unsigned char)val;
1730
1731         switch (op) {
1732         case BINARY_PRINT_CHAR_DATA:
1733                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1734                 break;
1735         case BINARY_PRINT_DATA_BEGIN:
1736         case BINARY_PRINT_LINE_BEGIN:
1737         case BINARY_PRINT_ADDR:
1738         case BINARY_PRINT_NUM_DATA:
1739         case BINARY_PRINT_NUM_PAD:
1740         case BINARY_PRINT_SEP:
1741         case BINARY_PRINT_CHAR_PAD:
1742         case BINARY_PRINT_LINE_END:
1743         case BINARY_PRINT_DATA_END:
1744         default:
1745                 break;
1746         }
1747 }
1748
1749 static void bpf_output__fprintf(struct trace *trace,
1750                                 struct perf_sample *sample)
1751 {
1752         print_binary(sample->raw_data, sample->raw_size, 8,
1753                      bpf_output__printer, trace->output);
1754 }
1755
/*
 * Handler for non-syscall events (--event): prints timestamp, event name
 * and either the BPF output payload or the libtraceevent-formatted fields,
 * optionally followed by the callchain.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: suppress output. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	/* Finish any half-printed syscall entry line first. */
	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines' duration slot. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1796
1797 static void print_location(FILE *f, struct perf_sample *sample,
1798                            struct addr_location *al,
1799                            bool print_dso, bool print_sym)
1800 {
1801
1802         if ((verbose || print_dso) && al->map)
1803                 fprintf(f, "%s@", al->map->dso->long_name);
1804
1805         if ((verbose || print_sym) && al->sym)
1806                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1807                         al->addr - al->sym->start);
1808         else if (al->map)
1809                 fprintf(f, "0x%" PRIx64, al->addr);
1810         else
1811                 fprintf(f, "0x%" PRIx64, sample->addr);
1812 }
1813
/*
 * Handler for the page-fault software events: updates the per-thread
 * maj/min fault counters and prints
 * "maj|minfault [faulting-ip-location] => faulted-addr-location (type level)".
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd'; /* default; becomes 'x' or '?' per map lookups below */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: suppress output. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the instruction address that took the fault... */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* ...then the data address that was being accessed. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* Not in a data map; try code maps (execution fault?). */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1889
1890 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1891 {
1892         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1893             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1894                 return false;
1895
1896         if (trace->pid_list || trace->tid_list)
1897                 return true;
1898
1899         return false;
1900 }
1901
1902 static void trace__set_base_time(struct trace *trace,
1903                                  struct perf_evsel *evsel,
1904                                  struct perf_sample *sample)
1905 {
1906         /*
1907          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1908          * and don't use sample->time unconditionally, we may end up having
1909          * some other event in the future without PERF_SAMPLE_TIME for good
1910          * reason, i.e. we may not be interested in its timestamps, just in
1911          * it taking place, picking some piece of information when it
1912          * appears in our event stream (vfs_getname comes to mind).
1913          */
1914         if (trace->base_time == 0 && !trace->full_time &&
1915             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1916                 trace->base_time = sample->time;
1917 }
1918
1919 static int trace__process_sample(struct perf_tool *tool,
1920                                  union perf_event *event,
1921                                  struct perf_sample *sample,
1922                                  struct perf_evsel *evsel,
1923                                  struct machine *machine __maybe_unused)
1924 {
1925         struct trace *trace = container_of(tool, struct trace, tool);
1926         int err = 0;
1927
1928         tracepoint_handler handler = evsel->handler;
1929
1930         if (skip_sample(trace, sample))
1931                 return 0;
1932
1933         trace__set_base_time(trace, evsel, sample);
1934
1935         if (handler) {
1936                 ++trace->nr_events;
1937                 handler(trace, evsel, event, sample);
1938         }
1939
1940         return err;
1941 }
1942
1943 static int parse_target_str(struct trace *trace)
1944 {
1945         if (trace->opts.target.pid) {
1946                 trace->pid_list = intlist__new(trace->opts.target.pid);
1947                 if (trace->pid_list == NULL) {
1948                         pr_err("Error parsing process id string\n");
1949                         return -EINVAL;
1950                 }
1951         }
1952
1953         if (trace->opts.target.tid) {
1954                 trace->tid_list = intlist__new(trace->opts.target.tid);
1955                 if (trace->tid_list == NULL) {
1956                         pr_err("Error parsing thread id string\n");
1957                         return -EINVAL;
1958                 }
1959         }
1960
1961         return 0;
1962 }
1963
1964 static int trace__record(struct trace *trace, int argc, const char **argv)
1965 {
1966         unsigned int rec_argc, i, j;
1967         const char **rec_argv;
1968         const char * const record_args[] = {
1969                 "record",
1970                 "-R",
1971                 "-m", "1024",
1972                 "-c", "1",
1973         };
1974
1975         const char * const sc_args[] = { "-e", };
1976         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1977         const char * const majpf_args[] = { "-e", "major-faults" };
1978         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1979         const char * const minpf_args[] = { "-e", "minor-faults" };
1980         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1981
1982         /* +1 is for the event string below */
1983         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1984                 majpf_args_nr + minpf_args_nr + argc;
1985         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1986
1987         if (rec_argv == NULL)
1988                 return -ENOMEM;
1989
1990         j = 0;
1991         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1992                 rec_argv[j++] = record_args[i];
1993
1994         if (trace->trace_syscalls) {
1995                 for (i = 0; i < sc_args_nr; i++)
1996                         rec_argv[j++] = sc_args[i];
1997
1998                 /* event string may be different for older kernels - e.g., RHEL6 */
1999                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2000                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2001                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2002                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2003                 else {
2004                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2005                         return -1;
2006                 }
2007         }
2008
2009         if (trace->trace_pgfaults & TRACE_PFMAJ)
2010                 for (i = 0; i < majpf_args_nr; i++)
2011                         rec_argv[j++] = majpf_args[i];
2012
2013         if (trace->trace_pgfaults & TRACE_PFMIN)
2014                 for (i = 0; i < minpf_args_nr; i++)
2015                         rec_argv[j++] = minpf_args[i];
2016
2017         for (i = 0; i < (unsigned int)argc; i++)
2018                 rec_argv[j++] = argv[i];
2019
2020         return cmd_record(j, rec_argv, NULL);
2021 }
2022
2023 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2024
2025 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2026 {
2027         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2028
2029         if (IS_ERR(evsel))
2030                 return false;
2031
2032         if (perf_evsel__field(evsel, "pathname") == NULL) {
2033                 perf_evsel__delete(evsel);
2034                 return false;
2035         }
2036
2037         evsel->handler = trace__vfs_getname;
2038         perf_evlist__add(evlist, evsel);
2039         return true;
2040 }
2041
2042 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2043 {
2044         struct perf_evsel *evsel;
2045         struct perf_event_attr attr = {
2046                 .type = PERF_TYPE_SOFTWARE,
2047                 .mmap_data = 1,
2048         };
2049
2050         attr.config = config;
2051         attr.sample_period = 1;
2052
2053         event_attr_init(&attr);
2054
2055         evsel = perf_evsel__new(&attr);
2056         if (evsel)
2057                 evsel->handler = trace__pgfault;
2058
2059         return evsel;
2060 }
2061
/*
 * Dispatch one event read from the mmap ring: non-samples go to the
 * generic machine-state machinery, samples to the handler registered on
 * the evsel they belong to.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		/* MMAP, COMM, FORK, EXIT etc.: keep thread/map state current. */
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* Guard against tracepoint samples that lack their raw payload. */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2090
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint events that drive
 * the strace-like output, cache their field offsets, add them to the
 * evlist and remember them in trace->syscalls.events.
 *
 * Returns 0 on success, -1 on failure (partially created events are
 * deleted on the way out).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2136
/*
 * Turn the syscall-id qualifier list (presumably an in/not-in set over the
 * "id" tracepoint field — see asprintf_expr_inout_ints) into a filter
 * expression appended to both the sys_enter and sys_exit events.
 *
 * Returns 0 on success, -1 on failure; errno is set to ENOMEM when the
 * filter string could not be built.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* perf_evsel__append_filter() returns 0 on success. */
	if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
		err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2157
2158 static int trace__run(struct trace *trace, int argc, const char **argv)
2159 {
2160         struct perf_evlist *evlist = trace->evlist;
2161         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2162         int err = -1, i;
2163         unsigned long before;
2164         const bool forks = argc > 0;
2165         bool draining = false;
2166
2167         trace->live = true;
2168
2169         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2170                 goto out_error_raw_syscalls;
2171
2172         if (trace->trace_syscalls)
2173                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2174
2175         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2176                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2177                 if (pgfault_maj == NULL)
2178                         goto out_error_mem;
2179                 perf_evlist__add(evlist, pgfault_maj);
2180         }
2181
2182         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2183                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2184                 if (pgfault_min == NULL)
2185                         goto out_error_mem;
2186                 perf_evlist__add(evlist, pgfault_min);
2187         }
2188
2189         if (trace->sched &&
2190             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2191                                    trace__sched_stat_runtime))
2192                 goto out_error_sched_stat_runtime;
2193
2194         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2195         if (err < 0) {
2196                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2197                 goto out_delete_evlist;
2198         }
2199
2200         err = trace__symbols_init(trace, evlist);
2201         if (err < 0) {
2202                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2203                 goto out_delete_evlist;
2204         }
2205
2206         perf_evlist__config(evlist, &trace->opts, NULL);
2207
2208         if (callchain_param.enabled) {
2209                 bool use_identifier = false;
2210
2211                 if (trace->syscalls.events.sys_exit) {
2212                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2213                                                      &trace->opts, &callchain_param);
2214                         use_identifier = true;
2215                 }
2216
2217                 if (pgfault_maj) {
2218                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2219                         use_identifier = true;
2220                 }
2221
2222                 if (pgfault_min) {
2223                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2224                         use_identifier = true;
2225                 }
2226
2227                 if (use_identifier) {
2228                        /*
2229                         * Now we have evsels with different sample_ids, use
2230                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2231                         * from a fixed position in each ring buffer record.
2232                         *
2233                         * As of this the changeset introducing this comment, this
2234                         * isn't strictly needed, as the fields that can come before
2235                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2236                         * some of those for things like copying the payload of
2237                         * pointer syscall arguments, and for vfs_getname we don't
2238                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2239                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2240                         */
2241                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2242                         perf_evlist__reset_sample_bit(evlist, ID);
2243                 }
2244         }
2245
2246         signal(SIGCHLD, sig_handler);
2247         signal(SIGINT, sig_handler);
2248
2249         if (forks) {
2250                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2251                                                     argv, false, NULL);
2252                 if (err < 0) {
2253                         fprintf(trace->output, "Couldn't run the workload!\n");
2254                         goto out_delete_evlist;
2255                 }
2256         }
2257
2258         err = perf_evlist__open(evlist);
2259         if (err < 0)
2260                 goto out_error_open;
2261
2262         err = bpf__apply_obj_config();
2263         if (err) {
2264                 char errbuf[BUFSIZ];
2265
2266                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2267                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2268                          errbuf);
2269                 goto out_error_open;
2270         }
2271
2272         /*
2273          * Better not use !target__has_task() here because we need to cover the
2274          * case where no threads were specified in the command line, but a
2275          * workload was, and in that case we will fill in the thread_map when
2276          * we fork the workload in perf_evlist__prepare_workload.
2277          */
2278         if (trace->filter_pids.nr > 0)
2279                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2280         else if (thread_map__pid(evlist->threads, 0) == -1)
2281                 err = perf_evlist__set_filter_pid(evlist, getpid());
2282
2283         if (err < 0)
2284                 goto out_error_mem;
2285
2286         if (trace->ev_qualifier_ids.nr > 0) {
2287                 err = trace__set_ev_qualifier_filter(trace);
2288                 if (err < 0)
2289                         goto out_errno;
2290
2291                 pr_debug("event qualifier tracepoint filter: %s\n",
2292                          trace->syscalls.events.sys_exit->filter);
2293         }
2294
2295         err = perf_evlist__apply_filters(evlist, &evsel);
2296         if (err < 0)
2297                 goto out_error_apply_filters;
2298
2299         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2300         if (err < 0)
2301                 goto out_error_mmap;
2302
2303         if (!target__none(&trace->opts.target))
2304                 perf_evlist__enable(evlist);
2305
2306         if (forks)
2307                 perf_evlist__start_workload(evlist);
2308
2309         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2310                                   evlist->threads->nr > 1 ||
2311                                   perf_evlist__first(evlist)->attr.inherit;
2312 again:
2313         before = trace->nr_events;
2314
2315         for (i = 0; i < evlist->nr_mmaps; i++) {
2316                 union perf_event *event;
2317
2318                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2319                         struct perf_sample sample;
2320
2321                         ++trace->nr_events;
2322
2323                         err = perf_evlist__parse_sample(evlist, event, &sample);
2324                         if (err) {
2325                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2326                                 goto next_event;
2327                         }
2328
2329                         trace__handle_event(trace, event, &sample);
2330 next_event:
2331                         perf_evlist__mmap_consume(evlist, i);
2332
2333                         if (interrupted)
2334                                 goto out_disable;
2335
2336                         if (done && !draining) {
2337                                 perf_evlist__disable(evlist);
2338                                 draining = true;
2339                         }
2340                 }
2341         }
2342
2343         if (trace->nr_events == before) {
2344                 int timeout = done ? 100 : -1;
2345
2346                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2347                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2348                                 draining = true;
2349
2350                         goto again;
2351                 }
2352         } else {
2353                 goto again;
2354         }
2355
2356 out_disable:
2357         thread__zput(trace->current);
2358
2359         perf_evlist__disable(evlist);
2360
2361         if (!err) {
2362                 if (trace->summary)
2363                         trace__fprintf_thread_summary(trace, trace->output);
2364
2365                 if (trace->show_tool_stats) {
2366                         fprintf(trace->output, "Stats:\n "
2367                                                " vfs_getname : %" PRIu64 "\n"
2368                                                " proc_getname: %" PRIu64 "\n",
2369                                 trace->stats.vfs_getname,
2370                                 trace->stats.proc_getname);
2371                 }
2372         }
2373
2374 out_delete_evlist:
2375         perf_evlist__delete(evlist);
2376         trace->evlist = NULL;
2377         trace->live = false;
2378         return err;
2379 {
2380         char errbuf[BUFSIZ];
2381
2382 out_error_sched_stat_runtime:
2383         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2384         goto out_error;
2385
2386 out_error_raw_syscalls:
2387         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2388         goto out_error;
2389
2390 out_error_mmap:
2391         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2392         goto out_error;
2393
2394 out_error_open:
2395         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2396
2397 out_error:
2398         fprintf(trace->output, "%s\n", errbuf);
2399         goto out_delete_evlist;
2400
2401 out_error_apply_filters:
2402         fprintf(trace->output,
2403                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2404                 evsel->filter, perf_evsel__name(evsel), errno,
2405                 strerror_r(errno, errbuf, sizeof(errbuf)));
2406         goto out_delete_evlist;
2407 }
2408 out_error_mem:
2409         fprintf(trace->output, "Not enough memory to run!\n");
2410         goto out_delete_evlist;
2411
2412 out_errno:
2413         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2414         goto out_delete_evlist;
2415 }
2416
/*
 * Replay mode ('perf trace -i <file>'): read a previously recorded
 * perf.data file and reconstruct the strace-like output from the events
 * it contains, instead of monitoring live.
 *
 * Returns 0 on success, negative on error (session creation, tracepoint
 * handler setup, target string parsing or event processing failures).
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",       trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	/* Wire up the generic event handlers used while reading the file. */
	trace->tool.sample        = trace__process_sample;
	trace->tool.mmap          = perf_event__process_mmap;
	trace->tool.mmap2         = perf_event__process_mmap2;
	trace->tool.comm          = perf_event__process_comm;
	trace->tool.exit          = perf_event__process_exit;
	trace->tool.fork          = perf_event__process_fork;
	trace->tool.attr          = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id      = perf_event__process_build_id;

	/* Samples in the file may be out of order; resort by timestamp. */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	/* Hook the syscall enter tracepoint recorded in the file. */
	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	/* Same for the syscall exit tracepoint. */
	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route page fault software events to the pagefault handler. */
	evlist__for_each(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	err = parse_target_str(trace);
	if (err != 0)
		goto out;

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2512
/* Print the summary section banner; returns the number of characters written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2521
/*
 * Resorted view of a thread's per-syscall stats intlist, ordered by total
 * time spent per syscall (msecs).  DEFINE_RESORT_RB() (rb_resort.h)
 * expands the comparison and the extra per-entry fields declared here;
 * the body below fills one resort entry from an intlist node whose
 * ->priv carries the struct stats accumulated for that syscall id.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = number of calls * average duration, converted to msec */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2535
/*
 * Print one thread's per-syscall summary table (calls, total, min/avg/max
 * duration and stddev), sorted by total time spent per syscall.
 *
 * Returns the number of characters written to @fp, 0 if the thread has no
 * syscall stats (or the resort tree could not be allocated).
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	/* syscall_stats_entry is the cursor name generated by the resort macro */
	resort_rb__for_each(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* durations are accumulated in ns; report in msec */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev expressed as a percentage of the mean */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2578
2579 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2580 {
2581         size_t printed = 0;
2582         struct thread_trace *ttrace = thread__priv(thread);
2583         double ratio;
2584
2585         if (ttrace == NULL)
2586                 return 0;
2587
2588         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2589
2590         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2591         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2592         printed += fprintf(fp, "%.1f%%", ratio);
2593         if (ttrace->pfmaj)
2594                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2595         if (ttrace->pfmin)
2596                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2597         if (trace->sched)
2598                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2599         else if (fputc('\n', fp) != EOF)
2600                 ++printed;
2601
2602         printed += thread__dump_stats(ttrace, trace, fp);
2603
2604         return printed;
2605 }
2606
2607 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2608 {
2609         return ttrace ? ttrace->nr_events : 0;
2610 }
2611
/*
 * Resorted view of a machine's thread rb-tree, keyed by per-thread event
 * count (comparison semantics are defined by DEFINE_RESORT_RB() in
 * rb_resort.h).  NOTE(review): reads thread->priv directly rather than via
 * thread__priv() — assumed equivalent here; confirm if the accessor ever
 * grows extra semantics.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2618
/*
 * Print the end-of-run summary: a banner followed by one section per
 * thread (ordered by event count), each with its syscall stats table.
 *
 * Returns the number of characters written, or 0 if the resort tree could
 * not be allocated (the banner may already have been printed by then).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	/* threads_entry is the cursor name generated by the resort macro */
	resort_rb__for_each(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2637
2638 static int trace__set_duration(const struct option *opt, const char *str,
2639                                int unset __maybe_unused)
2640 {
2641         struct trace *trace = opt->value;
2642
2643         trace->duration_filter = atof(str);
2644         return 0;
2645 }
2646
2647 static int trace__set_filter_pids(const struct option *opt, const char *str,
2648                                   int unset __maybe_unused)
2649 {
2650         int ret = -1;
2651         size_t i;
2652         struct trace *trace = opt->value;
2653         /*
2654          * FIXME: introduce a intarray class, plain parse csv and create a
2655          * { int nr, int entries[] } struct...
2656          */
2657         struct intlist *list = intlist__new(str);
2658
2659         if (list == NULL)
2660                 return -1;
2661
2662         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2663         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2664
2665         if (trace->filter_pids.entries == NULL)
2666                 goto out;
2667
2668         trace->filter_pids.entries[0] = getpid();
2669
2670         for (i = 1; i < trace->filter_pids.nr; ++i)
2671                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2672
2673         intlist__delete(list);
2674         ret = 0;
2675 out:
2676         return ret;
2677 }
2678
2679 static int trace__open_output(struct trace *trace, const char *filename)
2680 {
2681         struct stat st;
2682
2683         if (!stat(filename, &st) && st.st_size) {
2684                 char oldname[PATH_MAX];
2685
2686                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2687                 unlink(oldname);
2688                 rename(filename, oldname);
2689         }
2690
2691         trace->output = fopen(filename, "w");
2692
2693         return trace->output == NULL ? -errno : 0;
2694 }
2695
2696 static int parse_pagefaults(const struct option *opt, const char *str,
2697                             int unset __maybe_unused)
2698 {
2699         int *trace_pgfaults = opt->value;
2700
2701         if (strcmp(str, "all") == 0)
2702                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2703         else if (strcmp(str, "maj") == 0)
2704                 *trace_pgfaults |= TRACE_PFMAJ;
2705         else if (strcmp(str, "min") == 0)
2706                 *trace_pgfaults |= TRACE_PFMIN;
2707         else
2708                 return -1;
2709
2710         return 0;
2711 }
2712
/* Install the same sample handler on every evsel in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each(evlist, evsel)
		evsel->handler = handler;
}
2720
/*
 * Entry point for 'perf trace': parse the command line, set up the evlist
 * and syscall table, then dispatch to 'record', replay (-i <file>) or live
 * tracing of a workload/target.
 */
int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/*
	 * Defaults: uid/freq/interval/mmap_pages/max_stack use their "not set
	 * by the user" sentinels (UINT_MAX/ULLONG_MAX) and are resolved below.
	 */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid       = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const char *ev_qualifier_str = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK(0, "event", &trace.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault tracing needs the address and time in each sample. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	/* UINT_MAX means --max-stack was not given; pick a default. */
	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Stack depth options imply callchains; default to DWARF unwinding. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* Callchains need bigger mmap buffers; bump when running as root. */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record ...' delegates to perf record. */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && ev_qualifier_str) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	/* -e list: build the syscall name filter; a leading '!' negates it. */
	if (ev_qualifier_str != NULL) {
		const char *s = ev_qualifier_str;
		struct strlist_config slist_config = {
			.dirname = system_path(STRACE_GROUPS_DIR),
		};

		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(s, &slist_config);
		if (trace.ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
		}

		err = trace__validate_ev_qualifier(&trace);
		if (err)
			goto out_close;
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no explicit target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}