utrace-based ftrace "process" engine, v2 This is v2 of the prototype utrace-ftrace interface. This code is based on Roland McGrath's utrace API, which provides programmatic hooks to the in-tree tracehook layer. This new patch interfaces many of those events to ftrace, as configured by a small number of debugfs controls. Here's the /debugfs/tracing/process_trace_README: process event tracer mini-HOWTO 1. Select process hierarchy to monitor. Other processes will be completely unaffected. Leave at 0 for system-wide tracing. % echo NNN > process_follow_pid 2. Determine which process event traces are potentially desired. syscall and signal tracing slow down monitored processes. % echo 0 > process_trace_{syscalls,signals,lifecycle} 3. Add any final uid- or taskcomm-based filtering. Non-matching processes will skip trace messages, but will still be slowed. % echo NNN > process_trace_uid_filter # -1: unrestricted % echo ls > process_trace_taskcomm_filter # empty: unrestricted 4. Start tracing. % echo process > current_tracer 5. Examine trace. % cat trace 6. Stop tracing. % echo nop > current_tracer Signed-off-by: Frank Ch. Eigler --- include/linux/processtrace.h | 41 +++ kernel/trace/Kconfig | 9 + kernel/trace/Makefile | 1 + kernel/trace/trace.h | 8 + kernel/trace/trace_process.c | 601 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 660 insertions(+), 0 deletions(-) diff --git a/include/linux/processtrace.h b/include/linux/processtrace.h new file mode 100644 index ...f2b7d94 100644 --- /dev/null +++ b/include/linux/processtrace.h @@ -0,0 +1,41 @@ +#ifndef PROCESSTRACE_H +#define PROCESSTRACE_H + +#include +#include + +struct process_trace_entry { + unsigned char opcode; /* one of _UTRACE_EVENT_* */ + char comm[TASK_COMM_LEN]; /* XXX: should be in/via trace_entry */ + union { + struct { + pid_t child; + unsigned long flags; + } trace_clone; + struct { + long code; + } trace_exit; + struct { + } trace_exec; + struct { + int si_signo; + int si_errno; + int si_code; + } trace_signal; + struct { + long callno; + unsigned long args[6]; + } trace_syscall_entry; + struct { + long rc; + long error; + } trace_syscall_exit; + }; +}; + +/* in kernel/trace/trace_process.c */ + +extern void enable_process_trace(void); +extern void disable_process_trace(void); + +#endif /* PROCESSTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 34e707e..8a92d6f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -150,6 +150,15 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. +config PROCESS_TRACER + bool "Trace process events via utrace" + depends on DEBUG_KERNEL + select TRACING + select UTRACE + help + This tracer provides trace records from process events + accessible to utrace: lifecycle, system calls, and signals. + config BOOT_TRACER bool "Trace boot initcalls" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 349d5a9..a774db2 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -33,5 +33,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += t obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o +obj-$(CONFIG_PROCESS_TRACER) += trace_process.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4d3d381..c4d2e7f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ enum trace_type { TRACE_USER_STACK, TRACE_HW_BRANCHES, TRACE_POWER, + TRACE_PROCESS, __TRACE_LAST_TYPE }; @@ -170,6 +172,11 @@ struct trace_power { struct power_trace state_data; }; +struct trace_process { + struct trace_entry ent; + struct process_trace_entry event; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -280,6 +287,7 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ + IF_ASSIGN(var, ent, struct trace_process, TRACE_PROCESS); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_process.c b/kernel/trace/trace_process.c new file mode 100644 index ...0820e56 100644 --- /dev/null +++ b/kernel/trace/trace_process.c @@ -0,0 +1,601 @@ +/* + * utrace-based process event tracing + * Copyright (C) 2009 Red Hat Inc. + * By Frank Ch. Eigler + * + * Based on mmio ftrace engine by Pekka Paalanen + * and utrace-syscall-tracing prototype by Ananth Mavinakayanahalli + */ + +/* #define DEBUG 1 */ + +#include +#include +#include +#include +#include + +#include "trace.h" + +/* A process must match these filters in order to be traced. */ +static char trace_taskcomm_filter[TASK_COMM_LEN]; /* \0: unrestricted */ +static u32 trace_taskuid_filter = -1; /* -1: unrestricted */ +static u32 trace_lifecycle_p = 1; +static u32 trace_syscalls_p = 1; +static u32 trace_signals_p = 1; + +/* A process must be a direct child of given pid in order to be + followed. */ +static u32 process_follow_pid; /* 0: unrestricted/systemwide */ + +/* XXX: lock the above? */ + + +/* trace data collection */ + +static struct trace_array *process_trace_array; + +static void process_reset_data(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + tracing_reset_online_cpus(tr); +} + +static int process_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + process_trace_array = tr; + process_reset_data(tr); + enable_process_trace(); + return 0; +} + +static void process_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + disable_process_trace(); + process_reset_data(tr); + process_trace_array = NULL; +} + +static void process_trace_start(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + process_reset_data(tr); +} + +static void __trace_processtrace(struct trace_array *tr, + struct trace_array_cpu *data, + struct process_trace_entry *ent) +{ + struct ring_buffer_event *event; + struct trace_process *entry; + unsigned long irq_flags; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, preempt_count()); + entry->ent.cpu = raw_smp_processor_id(); + entry->ent.type = TRACE_PROCESS; + strlcpy(ent->comm, current->comm, TASK_COMM_LEN); + entry->event = *ent; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +void process_trace(struct process_trace_entry *ent) +{ + struct trace_array *tr = process_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + __trace_processtrace(tr, data, ent); + preempt_enable(); +} + + +/* trace data rendering */ + +static void process_pipe_open(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + pr_debug("in %s\n", __func__); + trace_seq_printf(s, "VERSION 200901\n"); +} + +static void process_close(struct trace_iterator *iter) +{ + iter->private = NULL; +} + +static ssize_t process_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct trace_seq *s = &iter->seq; + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 0 : ret; +} + +static enum print_line_t process_print(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_process *field; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + trace_assign_type(field, entry); + + /* XXX: If print_lat_fmt() were not static, we wouldn't have + to duplicate this. */ + trace_seq_printf(s, "%16s %5d %3d %9lu.%06ld ", + field->event.comm, + entry->pid, entry->cpu, + secs, + usec_rem); + + switch (field->event.opcode) { + case _UTRACE_EVENT_CLONE: + ret = trace_seq_printf(s, "fork %d flags 0x%lx\n", + field->event.trace_clone.child, + field->event.trace_clone.flags); + break; + case _UTRACE_EVENT_EXEC: + ret = trace_seq_printf(s, "exec\n"); + break; + case _UTRACE_EVENT_EXIT: + ret = trace_seq_printf(s, "exit %ld\n", + field->event.trace_exit.code); + break; + case _UTRACE_EVENT_SIGNAL: + ret = trace_seq_printf(s, "signal %d errno %d code 0x%x\n", + field->event.trace_signal.si_signo, + field->event.trace_signal.si_errno, + field->event.trace_signal.si_code); + break; + case _UTRACE_EVENT_SYSCALL_ENTRY: + ret = trace_seq_printf(s, "syscall %ld [0x%lx 0x%lx 0x%lx" + " 0x%lx 0x%lx 0x%lx]\n", + field->event.trace_syscall_entry.callno, + field->event.trace_syscall_entry.args[0], + field->event.trace_syscall_entry.args[1], + field->event.trace_syscall_entry.args[2], + field->event.trace_syscall_entry.args[3], + field->event.trace_syscall_entry.args[4], + field->event.trace_syscall_entry.args[5]); + break; + case _UTRACE_EVENT_SYSCALL_EXIT: + ret = trace_seq_printf(s, "syscall rc %ld error %ld\n", + field->event.trace_syscall_exit.rc, + field->event.trace_syscall_exit.error); + break; + default: + ret = trace_seq_printf(s, "process code %d?\n", + field->event.opcode); + break; + } + if (ret) + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_HANDLED; +} + + +static enum print_line_t process_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_PROCESS: + return process_print(iter); + default: + return TRACE_TYPE_HANDLED; /* ignore unknown entries */ + } +} + +static struct tracer process_tracer = { + .name = "process", + .init = process_trace_init, + .reset = process_trace_reset, + .start = process_trace_start, + .pipe_open = process_pipe_open, + .close = process_close, + .read = process_read, + .print_line = process_print_line, +}; + + + +/* utrace backend */ + +/* Should tracing apply to given task? Compare against filter + values. */ +static int trace_test(struct task_struct *tsk) +{ + if (trace_taskcomm_filter[0] + && strncmp(trace_taskcomm_filter, tsk->comm, TASK_COMM_LEN)) + return 0; + + if (trace_taskuid_filter != (u32)-1 + && trace_taskuid_filter != task_uid(tsk)) + return 0; + + return 1; +} + + +static const struct utrace_engine_ops process_trace_ops; + +static void process_trace_tryattach(struct task_struct *tsk) +{ + struct utrace_engine *engine; + + pr_debug("in %s\n", __func__); + engine = utrace_attach_task(tsk, + UTRACE_ATTACH_CREATE | + UTRACE_ATTACH_EXCLUSIVE, + &process_trace_ops, NULL); + if (IS_ERR(engine) || (engine == NULL)) { + pr_warning("utrace_attach_task %d (rc %p)\n", + tsk->pid, engine); + } else { + int rc; + + /* We always hook cost-free events. */ + unsigned long events = + UTRACE_EVENT(CLONE) | + UTRACE_EVENT(EXEC) | + UTRACE_EVENT(EXIT); + + /* Penalizing events are individually controlled, so that + utrace doesn't even take the monitored threads off their + fast paths, nor bother call our callbacks. */ + if (trace_syscalls_p) + events |= UTRACE_EVENT_SYSCALL; + if (trace_signals_p) + events |= UTRACE_EVENT_SIGNAL_ALL; + + rc = utrace_set_events(tsk, engine, events); + if (rc == -EINPROGRESS) + rc = utrace_barrier(tsk, engine); + if (rc) + pr_warning("utrace_set_events/barrier rc %d\n", rc); + + utrace_engine_put(engine); + pr_debug("attached in %s to %s(%d)\n", __func__, + tsk->comm, tsk->pid); + } +} + + +u32 process_trace_report_clone(enum utrace_resume_action action, + struct utrace_engine *engine, + struct task_struct *parent, + unsigned long clone_flags, + struct task_struct *child) +{ + if (trace_lifecycle_p && trace_test(parent)) { + struct process_trace_entry ent; + ent.opcode = _UTRACE_EVENT_CLONE; + ent.trace_clone.child = child->pid; + ent.trace_clone.flags = clone_flags; + process_trace(&ent); + } + + process_trace_tryattach(child); + + return UTRACE_RESUME; +} + + +u32 process_trace_report_syscall_entry(u32 action, + struct utrace_engine *engine, + struct task_struct *task, + struct pt_regs *regs) +{ + if (trace_syscalls_p && trace_test(task)) { + struct process_trace_entry ent; + ent.opcode = _UTRACE_EVENT_SYSCALL_ENTRY; + ent.trace_syscall_entry.callno = syscall_get_nr(task, regs); + syscall_get_arguments(task, regs, 0, 6, + ent.trace_syscall_entry.args); + process_trace(&ent); + } + + return UTRACE_RESUME; +} + + +u32 process_trace_report_syscall_exit(enum utrace_resume_action action, + struct utrace_engine *engine, + struct task_struct *task, + struct pt_regs *regs) +{ + if (trace_syscalls_p && trace_test(task)) { + struct process_trace_entry ent; + ent.opcode = _UTRACE_EVENT_SYSCALL_EXIT; + ent.trace_syscall_exit.rc = + syscall_get_return_value(task, regs); + ent.trace_syscall_exit.error = syscall_get_error(task, regs); + process_trace(&ent); + } + + return UTRACE_RESUME; +} + + +u32 process_trace_report_exec(enum utrace_resume_action action, + struct utrace_engine *engine, + struct task_struct *task, + const struct linux_binfmt *fmt, + const struct linux_binprm *bprm, + struct pt_regs *regs) +{ + if (trace_lifecycle_p && trace_test(task)) { + struct process_trace_entry ent; + ent.opcode = _UTRACE_EVENT_EXEC; + process_trace(&ent); + } + + /* We're already attached; no need for a new tryattach. */ + + return UTRACE_RESUME; +} + + +u32 process_trace_report_signal(u32 action, + struct utrace_engine *engine, + struct task_struct *task, + struct pt_regs *regs, + siginfo_t *info, + const struct k_sigaction *orig_ka, + struct k_sigaction *return_ka) +{ + if (trace_signals_p && trace_test(task)) { + struct process_trace_entry ent; + ent.opcode = _UTRACE_EVENT_SIGNAL; + ent.trace_signal.si_signo = info->si_signo; + ent.trace_signal.si_errno = info->si_errno; + ent.trace_signal.si_code = info->si_code; + process_trace(&ent); + } + + /* We're already attached, so no need for a new tryattach. */ + + return UTRACE_RESUME | utrace_signal_action(action); +} + + +u32 process_trace_report_exit(enum utrace_resume_action action, + struct utrace_engine *engine, + struct task_struct *task, + long orig_code, long *code) +{ + if (trace_lifecycle_p && trace_test(task)) { + struct process_trace_entry ent; + ent.opcode = _UTRACE_EVENT_EXIT; + ent.trace_exit.code = orig_code; + process_trace(&ent); + } + + /* There is no need to explicitly attach or detach here. */ + + return UTRACE_RESUME; +} + + +void enable_process_trace() +{ + struct task_struct *grp, *tsk; + + pr_debug("in %s\n", __func__); + rcu_read_lock(); + do_each_thread(grp, tsk) { + /* Skip over kernel threads. */ + if (tsk->flags & PF_KTHREAD) + continue; + + if (process_follow_pid) { + if (tsk->tgid == process_follow_pid || + tsk->parent->tgid == process_follow_pid) + process_trace_tryattach(tsk); + } else { + process_trace_tryattach(tsk); + } + } while_each_thread(grp, tsk); + rcu_read_unlock(); +} + +void disable_process_trace() +{ + struct utrace_engine *engine; + struct task_struct *grp, *tsk; + int rc; + + pr_debug("in %s\n", __func__); + rcu_read_lock(); + do_each_thread(grp, tsk) { + /* Find matching engine, if any. Returns -ENOENT for + unattached threads. */ + engine = utrace_attach_task(tsk, UTRACE_ATTACH_MATCH_OPS, + &process_trace_ops, 0); + if (IS_ERR(engine)) { + if (PTR_ERR(engine) != -ENOENT) + pr_warning("utrace_attach_task %d (rc %ld)\n", + tsk->pid, -PTR_ERR(engine)); + } else if (engine == NULL) { + pr_warning("utrace_attach_task %d (null engine)\n", + tsk->pid); + } else { + /* Found one of our own engines. Detach. */ + rc = utrace_control(tsk, engine, UTRACE_DETACH); + switch (rc) { + case 0: /* success */ + break; + case -ESRCH: /* REAP callback already begun */ + case -EALREADY: /* DEATH callback already begun */ + break; + default: + rc = -rc; + pr_warning("utrace_detach %d (rc %d)\n", + tsk->pid, rc); + break; + } + utrace_engine_put(engine); + pr_debug("detached in %s from %s(%d)\n", __func__, + tsk->comm, tsk->pid); + } + } while_each_thread(grp, tsk); + rcu_read_unlock(); +} + + +static const struct utrace_engine_ops process_trace_ops = { + .report_clone = process_trace_report_clone, + .report_exec = process_trace_report_exec, + .report_exit = process_trace_report_exit, + .report_signal = process_trace_report_signal, + .report_syscall_entry = process_trace_report_syscall_entry, + .report_syscall_exit = process_trace_report_syscall_exit, +}; + + + +/* control interfaces */ + + +static ssize_t +trace_taskcomm_filter_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + trace_taskcomm_filter, TASK_COMM_LEN); +} + + +static ssize_t +trace_taskcomm_filter_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char *end; + + if (cnt > TASK_COMM_LEN) + cnt = TASK_COMM_LEN; + + if (copy_from_user(trace_taskcomm_filter, ubuf, cnt)) + return -EFAULT; + + /* Cut from the first nil or newline. */ + trace_taskcomm_filter[cnt] = '\0'; + end = strchr(trace_taskcomm_filter, '\n'); + if (end) + *end = '\0'; + + *fpos += cnt; + return cnt; +} + + +static const struct file_operations trace_taskcomm_filter_fops = { + .open = tracing_open_generic, + .read = trace_taskcomm_filter_read, + .write = trace_taskcomm_filter_write, +}; + + + +static char README_text[] = + "process event tracer mini-HOWTO\n" + "\n" + "1. Select process hierarchy to monitor. Other processes will be\n" + " completely unaffected. Leave at 0 for system-wide tracing.\n" + "# echo NNN > process_follow_pid\n" + "\n" + "2. Determine which process event traces are potentially desired.\n" + " syscall and signal tracing slow down monitored processes.\n" + "# echo 0 > process_trace_{syscalls,signals,lifecycle}\n" + "\n" + "3. Add any final uid- or taskcomm-based filtering. Non-matching\n" + " processes will skip trace messages, but will still be slowed.\n" + "# echo NNN > process_trace_uid_filter # -1: unrestricted \n" + "# echo ls > process_trace_taskcomm_filter # empty: unrestricted\n" + "\n" + "4. Start tracing.\n" + "# echo process > current_tracer\n" + "\n" + "5. Examine trace.\n" + "# cat trace\n" + "\n" + "6. Stop tracing.\n" + "# echo nop > current_tracer\n" + ; + +static struct debugfs_blob_wrapper README_blob = { + .data = README_text, + .size = sizeof(README_text), +}; + + +static __init int init_process_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_blob("process_trace_README", 0444, d_tracer, + &README_blob); + if (!entry) + pr_warning("Could not create debugfs " + "'process_trace_README' entry\n"); + + /* Control for scoping process following. */ + entry = debugfs_create_u32("process_follow_pid", 0644, d_tracer, + &process_follow_pid); + if (!entry) + pr_warning("Could not create debugfs " + "'process_follow_pid' entry\n"); + + /* Process-level filters */ + entry = debugfs_create_file("process_trace_taskcomm_filter", 0644, + d_tracer, NULL, + &trace_taskcomm_filter_fops); + /* XXX: it'd be nice to have a read/write debugfs_create_blob. */ + if (!entry) + pr_warning("Could not create debugfs " + "'process_trace_taskcomm_filter' entry\n"); + + entry = debugfs_create_u32("process_trace_uid_filter", 0644, d_tracer, + &trace_taskuid_filter); + if (!entry) + pr_warning("Could not create debugfs " + "'process_trace_uid_filter' entry\n"); + + /* Event-level filters. */ + entry = debugfs_create_u32("process_trace_lifecycle", 0644, d_tracer, + &trace_lifecycle_p); + if (!entry) + pr_warning("Could not create debugfs " + "'process_trace_lifecycle' entry\n"); + + entry = debugfs_create_u32("process_trace_syscalls", 0644, d_tracer, + &trace_syscalls_p); + if (!entry) + pr_warning("Could not create debugfs " + "'process_trace_syscalls' entry\n"); + + entry = debugfs_create_u32("process_trace_signals", 0644, d_tracer, + &trace_signals_p); + if (!entry) + pr_warning("Could not create debugfs " + "'process_trace_signals' entry\n"); + + return register_tracer(&process_tracer); +} + +device_initcall(init_process_trace);