From 888a219f01d706e522209dbb763b5b647afc0989 Mon Sep 17 00:00:00 2001 From: AlbertXu Date: Wed, 13 Aug 2025 14:30:32 +0800 Subject: [PATCH 1/4] bpf_task_state: a BPF-based tool for collecting and analyzing process state transitions, latency, memory usage, IO, and stack traces in Linux. Features: - Collects per-process and system-wide state statistics (running, sleeping, etc.) - Supports filtering by PID and process name (comm), with wildcard and multi-process support - Tracks memory usage, read/write bytes, and page delay metrics (FreePages, Thrashing, SwapIn) - Outputs kernel and user stack traces for each process, with symbol resolution - Supports interval and event-driven sampling, latency threshold filtering, and per-instance statistics - Provides flame graph data for performance analysis Usage examples: - Show all process state statistics every second: perf-prof bpf:task-state -i 1000 - Filter by PID: perf-prof bpf:task-state -p 1234 -i 1000 - Filter by process name (supports wildcards): perf-prof bpf:task-state --filter 'java,python*' -i 1000 - Only show events with latency greater than 20ms and print stack traces: perf-prof bpf:task-state -p 1234 -SD --than 20ms -g - Output per-instance statistics with latency threshold: perf-prof bpf:task-state --perins --than 1ms -i 1000 --- bpf-skel/Build | 1 + bpf-skel/bpf_task_state.bpf.c | 406 ++++++++++++++++++++++ bpf-skel/task_state.h | 113 +++++++ bpf_task_state.c | 616 ++++++++++++++++++++++++++++++++++ tests/test_bpf-task-state.py | 88 +++++ 5 files changed, 1224 insertions(+) create mode 100644 bpf-skel/bpf_task_state.bpf.c create mode 100644 bpf-skel/task_state.h create mode 100644 bpf_task_state.c create mode 100644 tests/test_bpf-task-state.py diff --git a/bpf-skel/Build b/bpf-skel/Build index 6b1070a..744262d 100644 --- a/bpf-skel/Build +++ b/bpf-skel/Build @@ -1 +1,2 @@ perf-prof-$(CONFIG_LIBBPF) += kvm_exit.skel.h +perf-prof-$(CONFIG_LIBBPF) += bpf_task_state.skel.h diff --git a/bpf-skel/bpf_task_state.bpf.c b/bpf-skel/bpf_task_state.bpf.c new file mode 100644 index 0000000..c98a8e5 --- /dev/null +++ b/bpf-skel/bpf_task_state.bpf.c @@ -0,0 +1,406 @@ +#include "vmlinux.h" +#include +#include +#include + +#include "task_state.h" +#include "perf_output.bpf.h" + +/* Filter configuration */ +const volatile struct filters filter = { + .pid = false, + .comm = false, + .comm_num = 0, + .state = -1, + .latency = 0, + .stack = 0, + .perins = false, +}; + +/* BPF maps definition */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10240); + __type(key, pid_t); + __type(value, struct task_state_node); +} task_state_node SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10240); + __type(key, pid_t); + __type(value, struct task_last_state); +} last_task_node SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10240); + __type(key, struct state_key); + __type(value, struct state_info); +} state_info SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(__u32)); + __uint(value_size, MAX_STACKS * sizeof(__u64)); + __uint(max_entries, 10240); +} stack_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, int); + __type(value, int); +} target_tgids SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, char[16]); + __type(value, bool); +} target_comms_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + 
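/* Comm patterns supplied via --filter; a trailing '*' makes an entry a prefix match (see comm_prefix_match()). */
+    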
__uint(max_entries, MAX_COMM_FILTER); + __type(key, int); + __type(value, char[16]); +} filter_comms_map SEC(".maps"); + +/* -------------------------------------------------------------------------- */ +/* Utility functions */ +/* -------------------------------------------------------------------------- */ + +/* Update state statistics info */ +static struct state_info *__update_state_info(struct state_key *key, u64 delta) +{ + struct state_info *info = bpf_map_lookup_elem(&state_info, key); + if (info) { + info->calls++; + info->total += delta; + if (delta > info->max) + info->max = delta; + if (delta < info->min || !info->min) + info->min = delta; + return info; + } + struct state_info new_info = { + .calls = 1, + .total = delta, + .min = delta, + .max = delta, + }; + bpf_map_update_elem(&state_info, key, &new_info, BPF_ANY); + return bpf_map_lookup_elem(&state_info, key); +} + +/* Update state statistics for process or system */ +static struct state_info *update_state_info(pid_t pid, int state, u64 delta) +{ + struct state_key key = { .pid = pid, .state = state }; + return __update_state_info(&key, delta); +} + +/* Get process comm */ +static __always_inline int get_task_comm(char *comm, struct task_struct *task) +{ + __builtin_memset(comm, 0, TASK_COMM_LEN); + return bpf_core_read_str(comm, TASK_COMM_LEN, &task->comm); +} + +/* Prefix match for comm filter */ +static __always_inline bool comm_prefix_match(const char comm[TASK_COMM_LEN], + const char pattern[TASK_COMM_LEN]) +{ +#pragma unroll + for (int i = 0; i < TASK_COMM_LEN; i++) { + char pc = pattern[i]; + char cc = comm[i]; + + if (pc == '*') + return true; + if (pc == '\0') + return cc == '\0'; + if (cc == '\0') + return false; + if (pc != cc) + return false; + } + return true; +} + +/* Check if comm matches target */ +static __always_inline bool is_target_comm(const char comm[TASK_COMM_LEN]) +{ + bool *exists = bpf_map_lookup_elem(&target_comms_map, comm); + if (exists) + return true; + + for (int i = 0; i < MAX_COMM_FILTER; i++) { + if (i >= filter.comm_num) + break; + int key = i; + const char *pattern = bpf_map_lookup_elem(&filter_comms_map, &key); + if (!pattern) + continue; + if (comm_prefix_match(comm, pattern)) { + bool one = 1; + bpf_map_update_elem(&target_comms_map, comm, &one, BPF_ANY); + return true; + } + } + return false; +} + +/* Output stacktrace event to perf buffer */ +static __always_inline void update_info_to_perf(void *ctx, struct task_state_node *task_node, u64 latency, u64 total_delay) +{ + struct stacktrace_event event = {}; + event.pid = task_node->pid; + event.state = task_node->last_state; + event.latency = latency; + event.total_delay = total_delay; + + if (filter.stack) { + event.last_user_stack_id = task_node->last_user_stack_id; + event.last_kern_stack_id = task_node->last_kern_stack_id; + event.user_stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_USER_STACK); + event.kern_stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_FAST_STACK_CMP); + } + perf_output(ctx, &event, sizeof(event)); +} + +/* Update process state node */ +static __always_inline void update_task_state_node(struct task_state_node *node, int pid, int last_state, u64 time, struct task_struct *task, void *ctx) +{ + if (last_state != TASK_RUNNING) { + struct percpu_counter *rss = BPF_CORE_READ(task, mm, rss_stat); + node->curr_state_info.memused = rss[0].count + rss[1].count + rss[3].count; + node->curr_state_info.readchar = BPF_CORE_READ(task, ioac.rchar); + node->curr_state_info.writechar = BPF_CORE_READ(task, ioac.wchar); + 
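/* Delay-accounting counters (ns) read from task->delays (CONFIG_TASK_DELAY_ACCT); the interval printer diffs successive snapshots. */
+        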
node->curr_state_info.freepages_delay = BPF_CORE_READ(task, delays, freepages_delay); + node->curr_state_info.thrashing_delay = BPF_CORE_READ(task, delays, thrashing_delay); + node->curr_state_info.swapin_delay = BPF_CORE_READ(task, delays, swapin_delay); + } + node->last_time = time; + node->last_state = last_state; + node->priority = BPF_CORE_READ(task, normal_prio); + get_task_comm(node->comm, task); + if (filter.stack) { + node->last_user_stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_USER_STACK); + node->last_kern_stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_FAST_STACK_CMP); + } + bpf_map_update_elem(&task_state_node, &pid, node, BPF_ANY); +} + +/* Handle process context switch */ +SEC("tp_btf/sched_switch") +void BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next) +{ + u64 time = bpf_ktime_get_ns(); + pid_t prev_pid = BPF_CORE_READ(prev, pid); + pid_t next_pid = BPF_CORE_READ(next, pid); + int prev_tgid = BPF_CORE_READ(prev, tgid); + int next_tgid = BPF_CORE_READ(next, tgid); + int prev_state = BPF_CORE_READ(prev, __state); + int next_state = BPF_CORE_READ(next, __state); + char prev_comm[TASK_COMM_LEN], next_comm[TASK_COMM_LEN]; + struct task_state_node *prev_node, *next_node; + + /* Filter by pid or comm */ + if (filter.pid) { + if (bpf_map_lookup_elem(&target_tgids, &prev_tgid)) + goto record_prev_info; + else if (bpf_map_lookup_elem(&target_tgids, &next_tgid)) + goto record_next_info; + else + return; + } else if (filter.comm) { + get_task_comm(prev_comm, prev); + get_task_comm(next_comm, next); + if (is_target_comm(prev_comm)) + goto record_prev_info; + else if (is_target_comm(next_comm)) + goto record_next_info; + else + return; + } + +record_prev_info: + /* Update prev process running time and state */ + if (prev_pid > 0) { + prev_node = bpf_map_lookup_elem(&task_state_node, &prev_pid); + if (prev_node) { + if (filter.state < 0 && prev_node->last_state == TASK_RUNNING && + time > prev_node->last_time && prev_pid > 0 && prev_pid != next_pid) { + u64 delta = time - prev_node->last_time; + if (filter.pid || filter.comm || filter.perins) { + struct state_info *info = update_state_info(prev_pid, TASK_RUNNING, delta); + if (prev_node && filter.latency > 0 && info && + (info->total > filter.latency || delta > filter.latency)) + update_info_to_perf(ctx, prev_node, delta, info->total); + if (!prev_node->has_state_info) + prev_node->has_state_info = true; + } + if (!(filter.pid || filter.comm || filter.perins)) + update_state_info(-1, TASK_RUNNING, delta); + } + update_task_state_node(prev_node, prev_pid, prev_state, time, prev, ctx); + } else if (prev_pid > 0 && prev_pid != next_pid) { + struct task_state_node new_node = { + .pid = prev_pid, + .tgid = prev_tgid, + .has_state_info = false, + }; + update_task_state_node(&new_node, prev_pid, prev_state, time, prev, ctx); + } + if (filter.pid || filter.comm) + return; + } + +record_next_info: + /* Update next process RUNDELAY statistics */ + if (next_pid > 0) { + next_node = bpf_map_lookup_elem(&task_state_node, &next_pid); + if (next_node) { + if (filter.state < 0 && next_node->last_state == TASK_RUNNING && + time > next_node->last_time && next_pid > 0 && prev_pid != next_pid) { + u64 delta = time - next_node->last_time; + if (filter.pid || filter.comm || filter.perins) { + struct state_info *info = update_state_info(next_pid, RUNDELAY, delta); + if (next_node && filter.latency > 0 && info && + (info->total > filter.latency || delta > filter.latency)) + update_info_to_perf(ctx, next_node, 
delta, info->total); + if (!next_node->has_state_info) + next_node->has_state_info = true; + } + if (!(filter.pid || filter.comm)) + update_state_info(-1, RUNDELAY, delta); + } + update_task_state_node(next_node, next_pid, TASK_RUNNING, time, next, ctx); + } else if (next_pid > 0 && prev_pid != next_pid) { + struct task_state_node new_node = { + .pid = next_pid, + .tgid = next_tgid, + .has_state_info = false, + }; + update_task_state_node(&new_node, next_pid, TASK_RUNNING, time, next, ctx); + } + } +} + +/* Handle process wakeup (from wait queue to ready queue) */ +SEC("tp_btf/sched_wakeup") +void BPF_PROG(sched_wakeup, struct task_struct *task) +{ + u64 time = bpf_ktime_get_ns(); + pid_t pid = BPF_CORE_READ(task, pid); + int tgid = BPF_CORE_READ(task, tgid); + char comm[TASK_COMM_LEN]; + struct task_state_node *node = NULL; + + /* Filter by pid or comm */ + if (filter.pid) { + if (bpf_map_lookup_elem(&target_tgids, &tgid)) + goto record_info; + else + return; + } else if (filter.comm) { + get_task_comm(comm, task); + if (is_target_comm(comm)) + goto record_info; + else + return; + } + +record_info: + node = bpf_map_lookup_elem(&task_state_node, &pid); + if (pid <= 0) + return; + if (node) { + if (filter.state != TASK_NO_INTERRUPTIBLE && + (filter.state == -1 || (filter.state & node->last_state) == node->last_state) && + node->last_state && time > node->last_time && pid > 0) { + u64 delta = time - node->last_time; + if (filter.pid || filter.comm || filter.perins) { + struct state_info *info = update_state_info(pid, node->last_state, delta); + if (node && filter.latency > 0 && info && + (info->total > filter.latency || delta > filter.latency)) + update_info_to_perf(ctx, node, delta, info->total); + if (!node->has_state_info) + node->has_state_info = true; + } + if (!(filter.pid || filter.comm)) { + update_state_info(-1, node->last_state, delta); + } + } + update_task_state_node(node, pid, TASK_RUNNING, time, task, ctx); + } else { + struct task_state_node new_node = { + .pid = pid, + .tgid = tgid, + .last_time = time, + .has_state_info = false, + }; + update_task_state_node(&new_node, pid, TASK_RUNNING, time, task, ctx); + } +} + +/* Handle new process wakeup (enter ready queue) */ +SEC("tp_btf/sched_wakeup_new") +void BPF_PROG(sched_wakeup_new, struct task_struct *task) +{ + u64 time = bpf_ktime_get_ns(); + pid_t pid = BPF_CORE_READ(task, pid); + int tgid = BPF_CORE_READ(task, tgid); + char comm[TASK_COMM_LEN]; + struct task_state_node *node = NULL; + + /* Filter by pid or comm */ + if (filter.pid) { + if (bpf_map_lookup_elem(&target_tgids, &tgid)) + goto record_info; + else + return; + } else if (filter.comm) { + get_task_comm(comm, task); + if (is_target_comm(comm)) + goto record_info; + else + return; + } + +record_info: + if (pid <= 0) + return; + node = bpf_map_lookup_elem(&task_state_node, &pid); + if (node) { + if (filter.state != TASK_NO_INTERRUPTIBLE && + (filter.state == -1 || (filter.state & node->last_state) == node->last_state) && + node->last_state && time > node->last_time && pid > 0) { + u64 delta = time - node->last_time; + if (filter.pid || filter.comm || filter.perins) { + struct state_info *info = update_state_info(pid, node->last_state, delta); + if (node && filter.latency > 0 && info && + (info->total > filter.latency || delta > filter.latency)) + update_info_to_perf(ctx, node, delta, info->total); + if (!node->has_state_info) + node->has_state_info = true; + } + if (!(filter.pid || filter.comm)) { + update_state_info(-1, node->last_state, delta); + } + } + 
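/* A newly woken task enters the run queue as TASK_RUNNING. */
+        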
update_task_state_node(node, pid, TASK_RUNNING, time, task, ctx); + } else { + struct task_state_node new_node = { + .pid = pid, + .tgid = tgid, + .last_time = time, + .has_state_info = false, + }; + update_task_state_node(&new_node, pid, TASK_RUNNING, time, task, ctx); + } +} + +char LICENSE[] SEC("license") = "GPL"; \ No newline at end of file diff --git a/bpf-skel/task_state.h b/bpf-skel/task_state.h new file mode 100644 index 0000000..65265ef --- /dev/null +++ b/bpf-skel/task_state.h @@ -0,0 +1,113 @@ +#ifndef __TASK_STATE_H__ +#define __TASK_STATE_H__ + +#include +#include + +/* Task comm string length */ +#define TASK_COMM_LEN 16 + +/* Task state flags (used in tsk->state) */ +#define TASK_RUNNING 0x00000000 +#define TASK_INTERRUPTIBLE 0x00000001 +#define TASK_UNINTERRUPTIBLE 0x00000002 +#define __TASK_STOPPED 0x00000004 +#define __TASK_TRACED 0x00000008 + +/* Task exit state flags (used in tsk->exit_state) */ +#define EXIT_DEAD 0x00000010 +#define EXIT_ZOMBIE 0x00000020 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) + +/* Additional task state flags */ +#define TASK_PARKED 0x00000040 +#define TASK_DEAD 0x00000080 +#define TASK_WAKEKILL 0x00000100 +#define TASK_WAKING 0x00000200 +#define TASK_NOLOAD 0x00000400 +#define TASK_NEW 0x00000800 +#define TASK_RTLOCK_WAIT 0x00001000 +#define TASK_FREEZABLE 0x00002000 +#define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) +#define TASK_FROZEN 0x00008000 +#define TASK_STATE_MAX 0x00010000 +#define RUNDELAY (TASK_STATE_MAX << 1) +#define TASK_ANY (TASK_STATE_MAX - 1) + +/* Composite state macros */ +#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) +#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) +#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) +#define TASK_TRACED __TASK_TRACED +#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) +#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) +#define TASK_NO_INTERRUPTIBLE -2 + +/* Stack and filter limits */ +#define MAX_STACKS 32 // Maximum stack depth +#define MAX_COMM_FILTER 16 // Maximum number of comm filters + +struct filters { + bool pid; // Filter by pid + bool comm; // Filter by comm + int comm_num; // Number of comm filters + int state; // Task state filter + uint64_t latency; // Latency threshold + int stack; // Stack trace enabled + bool perins; // Per-process statistics enabled +}; + +/* Key for state statistics map */ +struct state_key { + pid_t pid; // -1 for system, pid for process + int state; // Task state +}; + +/* Value for state statistics map */ +struct state_info { + int calls; // Number of calls + unsigned long total; // Total time spent in this state (us) + unsigned long min; // Minimum time spent in this state (us) + unsigned long max; // Maximum time spent in this state (us) + unsigned long p50; // 50th percentile (us) + unsigned long p95; // 95th percentile (us) + unsigned long p99; // 99th percentile (us) +}; + +/* Last state info for a task */ +struct task_last_state { + int memused; // Memory used + uint64_t readchar; // Bytes read + uint64_t writechar; // Bytes written + uint64_t freepages_delay; // Free pages delay + uint64_t thrashing_delay; // Thrashing delay + uint64_t swapin_delay; // Swap-in delay +}; + +/* State node for a task */ +struct task_state_node { + pid_t pid; // Process ID + pid_t tgid; // Thread Group ID + char comm[TASK_COMM_LEN]; // Command name + unsigned long long last_time; // Timestamp (ns) + int last_state; // Last state + int last_user_stack_id; // Last user stack ID + int 
last_kern_stack_id; // Last kernel stack ID + bool has_state_info; // State info available + int priority; // Task priority + struct task_last_state curr_state_info; // Current state info +}; + +/* Stack trace event structure */ +struct stacktrace_event { + pid_t pid; // Process ID + uint64_t latency; // Latency (ns) + uint64_t total_delay; // Total delay (ns) + int state; // Task state + int last_user_stack_id; // Last user stack ID + int last_kern_stack_id; // Last kernel stack ID + int user_stack_id; // User stack ID + int kern_stack_id; // Kernel stack ID +}; + +#endif /* __TASK_STATE_H__ */ \ No newline at end of file diff --git a/bpf_task_state.c b/bpf_task_state.c new file mode 100644 index 0000000..2b2e020 --- /dev/null +++ b/bpf_task_state.c @@ -0,0 +1,616 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct taskstate_ctx { + struct bpf_task_state_bpf *obj; + struct perf_thread_map *thread_map; + struct perf_evsel *evsel; + struct ksyms *ksyms; + struct syms_cache *syms_cache; +}; + +// static struct ksyms *ksyms = NULL; + +/* Initialize monitor context */ +static int monitor_ctx_init(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx = zalloc(sizeof(*ctx)); + if (!ctx) + return -1; + dev->private = ctx; + + ctx->obj = bpf_task_state_bpf__open(); + if (!ctx->obj) { + printf("Failed to open BPF skeleton\n"); + goto free_ctx; + } + return 0; +free_ctx: + free(ctx); + return -1; +} + +/* Cleanup monitor context */ +static void monitor_ctx_exit(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx = dev->private; + bpf_task_state_bpf__detach(ctx->obj); + bpf_task_state_bpf__destroy(ctx->obj); + free(ctx); +} + +/* Get TGID for a given TID */ +static int get_tgid(int tid) +{ + char path[256], line[256]; + FILE *fp; + int pid = 0; + + snprintf(path, sizeof(path), "/proc/%d/status", tid); + fp = fopen(path, "r"); + if (!fp) + return -1; + + while (fgets(line, sizeof(line), fp)) { + if (sscanf(line, "Tgid: %d", &pid) == 1) + break; + pid = 0; + } + fclose(fp); + + return pid; +} + +/* Count number of process names in filter string */ +static int count_process_names(const char *filter_str) +{ + int count = 0; + char *filter = strdup(filter_str); + char *s = filter; + char *sep; + + while ((sep = strchr(s, ',')) != NULL) { + *sep = '\0'; + if (*s) + count++; + s = sep + 1; + } + if (*s) + count++; + free(filter); + return count; +} + +/* Pattern match for comm string */ +static int comm_pattern_match(const char *comm, const char *pattern) +{ + for (int i = 0; i < TASK_COMM_LEN; i++) { + char pc = pattern[i]; + char cc = comm[i]; + if (pc == '*') return 1; + if (pc == '\0') return cc == '\0'; + if (cc == '\0') return 0; + if (pc != cc) return 0; + } + return 1; +} + +/* Initialize filter_comms_map with filter string */ +static int init_filter_comms_map(struct bpf_map *filter_comms_map, const char *filter_str) +{ + char *filter = strdup(filter_str); + char *s; + char *sep; + char comm[16]; + int idx = 0; + if (!filter) return -1; + s = filter; + while ((sep = strchr(s, ',')) != NULL && idx < MAX_COMM_FILTER) { + *sep = '\0'; + if (*s) { + memset(comm, 0, sizeof(comm)); + strncpy(comm, s, 15); + bpf_map__update_elem(filter_comms_map, &idx, sizeof(idx), comm, sizeof(comm), 0); + idx++; + } + s = sep + 1; + } + if (*s && idx < MAX_COMM_FILTER) { + memset(comm, 0, sizeof(comm)); + strncpy(comm, s, 15); + bpf_map__update_elem(filter_comms_map, 
&idx, sizeof(idx), comm, sizeof(comm), 0); + idx++; + } + free(filter); + return 0; +} + +/* Initialize target_comms_map by scanning /proc and matching filter patterns */ +static int init_target_comms_map(struct bpf_map *filter_comms_map, int filter_cnt, struct bpf_map *target_comms_map) +{ + DIR *proc; + struct dirent *entry; + char comm_path[300], comm[16], pattern[16]; + bool dummy = true; + proc = opendir("/proc"); + if (!proc) return -1; + while ((entry = readdir(proc)) != NULL) { + FILE *f; + if (!isdigit(entry->d_name[0])) + continue; + snprintf(comm_path, sizeof(comm_path), "/proc/%s/comm", entry->d_name); + f = fopen(comm_path, "r"); + if (!f) continue; + if (!fgets(comm, sizeof(comm), f)) { + fclose(f); + continue; + } + fclose(f); + comm[strcspn(comm, "\n")] = 0; + for (int i = 0; i < filter_cnt; i++) { + memset(pattern, 0, sizeof(pattern)); + if (bpf_map__lookup_elem(filter_comms_map, &i, sizeof(i), pattern, sizeof(pattern), 0) == 0) { + if (comm_pattern_match(comm, pattern)) { + bpf_map__update_elem(target_comms_map, comm, sizeof(comm), &dummy, sizeof(dummy), 0); + break; + } + } + } + } + closedir(proc); + return 0; +} + +static int bpf_task_state_init(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx; + struct env *env = dev->env; + struct perf_evlist *evlist = dev->evlist; + struct bpf_map *filter_comms_map; + struct bpf_map *target_comms_map; + int comm_num; + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_BPF_OUTPUT, + .size = sizeof(struct perf_event_attr), + .sample_period = 1, + .sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW, + .read_format = 0, + .pinned = 1, + .disabled = 0, + .watermark = 1, + .wakeup_watermark = 1, + }; + + if (monitor_ctx_init(dev) < 0) + return -1; + + ctx = dev->private; + + prof_dev_env2attr(dev, &attr); + ctx->evsel = perf_evsel__new(&attr); + if (!ctx->evsel) + goto failed; + perf_evlist__add(evlist, ctx->evsel); + + ctx->obj->rodata->filter.pid = env->pids ? true : false; + ctx->obj->rodata->filter.comm = env->filter ? true : false; + ctx->obj->rodata->filter.perins = env->perins ? true : false; + ctx->obj->rodata->filter.latency = env->greater_than ? env->greater_than : ctx->obj->rodata->filter.latency; + ctx->obj->rodata->filter.stack = env->callchain ? 
true : false; + + if (env->interruptible && env->uninterruptible) { + ctx->obj->rodata->filter.state = TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE; + } else if (env->interruptible) { + ctx->obj->rodata->filter.state = TASK_INTERRUPTIBLE; + } else if (env->uninterruptible) { + ctx->obj->rodata->filter.state = TASK_UNINTERRUPTIBLE; + } else if (env->interruptible_set && !env->interruptible){ + ctx->obj->rodata->filter.state = TASK_NO_INTERRUPTIBLE; + } + + if (env->filter) { + comm_num = count_process_names(env->filter); + ctx->obj->rodata->filter.comm_num = comm_num; + } + + if (bpf_task_state_bpf__load(ctx->obj)){ + fprintf(stderr, "Failed to load and verify BPF skeleton\n"); + goto failed; + } + + filter_comms_map = ctx->obj->maps.filter_comms_map; + target_comms_map = ctx->obj->maps.target_comms_map; + if (env->filter) { + if (init_filter_comms_map(filter_comms_map, env->filter) < 0) { + fprintf(stderr, "Failed to initialize filter_comms_map\n"); + return -1; + } + if (init_target_comms_map(filter_comms_map, comm_num, target_comms_map) < 0) { + fprintf(stderr, "Failed to initialize target_comms_map\n"); + return -1; + } + } + return 0; + +failed: + monitor_ctx_exit(dev); + return -1; +} + +#define FD(e, x, y) ((int *) xyarray__entry(e->fd, x, y)) +static int bpf_task_state_filter(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx = dev->private; + struct bpf_map *target_tgids = ctx->obj->maps.target_tgids; + int idx, tid, err, cpu, ins, perf_event_fd; + + ctx->thread_map = dev->threads; + perf_cpu_map__put(dev->cpus); + dev->cpus = perf_cpu_map__new(NULL); + dev->threads = perf_thread_map__new_dummy(); + + if (ctx->thread_map) { + perf_thread_map__for_each_thread(tid, idx, ctx->thread_map) { + int tgid = get_tgid(tid); + if (tgid > 0) { + bpf_map__update_elem(target_tgids, &tgid, sizeof(tgid), &tgid, sizeof(tgid), 0); + } + } + } + + perf_cpu_map__for_each_cpu(cpu, ins, dev->cpus) { + tid = 0; idx = 0; + perf_thread_map__for_each_thread(tid, idx, ctx->thread_map) { + int *fd_ptr; + fd_ptr = FD(ctx->evsel, ins, idx); + if (!fd_ptr) { + continue; + } + perf_event_fd = *fd_ptr; + err = bpf_map__update_elem(ctx->obj->maps.perf_events, &cpu, sizeof(cpu), &perf_event_fd, sizeof(perf_event_fd), 0); + if (err < 0) { + printf("Failed to update perf_events map\n"); + return -1; + } + } + } + + ctx->ksyms = ksyms__load(); + if (!ctx->ksyms) { + printf("Failed to create ksyms map\n"); + return -1; + } + ctx->syms_cache = syms_cache__new(); + if (!ctx->syms_cache) { + printf("Failed to create syms_cache map\n"); + ksyms__free(ctx->ksyms); + return -1; + } + + err = bpf_task_state_bpf__attach(ctx->obj); + if (err) { + printf("Failed to attach BPF skeleton\n"); + } + return 1; +} + +/* Convert state value to string */ +static const char *state_to_str(int state) +{ + switch (state) { + case TASK_RUNNING: return "R"; + case TASK_INTERRUPTIBLE: return "S"; + case TASK_UNINTERRUPTIBLE: return "D"; + case __TASK_STOPPED: return "T"; + case TASK_IDLE: return "I"; + case RUNDELAY: return "RD"; + default: return "?"; + } +} + +/* Print state statistics for a process */ +static void print_state_info(struct bpf_map *map, pid_t pid) +{ + struct state_key key = { .pid = pid }; + struct state_info value = {0}; + static const int states[] = { + TASK_RUNNING, + TASK_INTERRUPTIBLE, + TASK_UNINTERRUPTIBLE, + __TASK_STOPPED, + TASK_IDLE, + RUNDELAY, + }; + static const char *state_strs[] = { "R", "S", "D", "T", "I", "RD" }; + bool has_data = false; + + printf("%3s %10s %12s %12s %12s %12s\n", + "St", "calls", 
"total(us)", "min(us)", "max(us)", "avg(us)"); + printf("%3s %10s %12s %12s %12s %12s\n", + "---", "----------", "------------", "------------", "------------", "------------"); + for (int i = 0; i < (int)(sizeof(states)/sizeof(states[0])); i++) { + key.state = states[i]; + if (bpf_map__lookup_elem(map, &key, sizeof(key), &value, sizeof(value), 0) == 0 && value.calls > 0) { + has_data = true; + printf("%3s %10u %12.3f %12.3f %12.3f %12.3f\n", + state_strs[i], + value.calls, + value.total / 1000.0, + value.min / 1000.0, + value.max / 1000.0, + value.calls ? (double)value.total / value.calls / 1000.0 : 0.0 + ); + bpf_map__delete_elem(map, &key, sizeof(key), 0); + } + } + if (!has_data) + printf("%3s %10s %12s %12s %12s %12s\n", + "NULL", "NULL", "NULL", "NULL", "NULL", "NULL"); + printf("\n"); +} + +/* Print kernel symbol for a given address */ +static void print_ksym(__u64 addr, struct taskstate_ctx *ctx) +{ + const struct ksym *sym; + if (!addr) + return; + sym = ksyms__map_addr(ctx->ksyms, addr); + if (!sym) { + printf(" 0x%llx Unknown", addr); + } else { + printf(" 0x%llx %s+0x%llx", addr, sym->name, addr - sym->addr); + } + printf(" ([kernel.kallsysms])\n"); +} + +/* Print user symbol for a given address */ +static int print_usym(__u64 addr, pid_t pid, struct taskstate_ctx *ctx) +{ + struct syms *syms; + struct dso *dso = NULL; + const struct sym *sym = NULL; + uint64_t offset = 0; + const char *symbol = "Unknown"; + int unknown = 0; + + // 获取用户符号缓存 + syms = syms_cache__get_syms(ctx->syms_cache, pid); + if (syms) { + dso = syms__find_dso(syms, addr, &offset); + if (dso) { + sym = dso__find_sym(dso, offset); + if (sym) { + symbol = sym__name(sym); + offset = offset - sym->start; + unknown = dso__name(dso) ? 0 : 1; + } + } else + unknown = 1; + } + printf(" 0x%llx %s+0x%lx (%s)\n", addr, symbol, offset, unknown? 
"Unknown" : dso__name(dso)); + return unknown; +} + +/* Print stack trace for a given stack id */ +static void print_stack(int fd, int key, bool kernel_user_stack, pid_t pid, struct taskstate_ctx *ctx) +{ + __u64 ip[PERF_MAX_STACK_DEPTH] = {}; + int i, unknown = 0; + if (bpf_map_lookup_elem(fd, &key, &ip) != 0) { + printf("not find;\n"); + } else { + if(!kernel_user_stack) + for (i = 0; i < PERF_MAX_STACK_DEPTH; i++) + print_ksym(ip[i], ctx); + else + for (i = 0; i < PERF_MAX_STACK_DEPTH; i++){ + if(unknown < 2) + unknown += print_usym(ip[i], pid, ctx); + } + } +} + +/* Print all stack traces for a stacktrace_event */ +static void print_stacks(int fd, struct stacktrace_event *info, pid_t pid, struct taskstate_ctx *ctx) +{ + if (info->last_kern_stack_id) { + printf("Last kernel stack trace:\n"); + print_stack(fd, info->last_kern_stack_id, 0, pid, ctx); + } + if (info->last_user_stack_id) { + printf("Last user stack trace:\n"); + print_stack(fd, info->last_user_stack_id, 1, pid, ctx); + } + if (info->kern_stack_id) { + printf("Kernel stack trace:\n"); + print_stack(fd, info->kern_stack_id, 0, pid, ctx); + } + if (info->user_stack_id) { + printf("User stack trace:\n"); + print_stack(fd, info->user_stack_id, 1, pid, ctx); + } +} + +/* Print statistics for each process at interval */ +static void bpf_task_state_interval(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx = dev->private; + if (dev->env->pids || dev->env->filter) { + pid_t key = -1, next_key = -1; + struct task_state_node task_node = {0}; + int task_state_node_fd = bpf_map__fd(ctx->obj->maps.task_state_node); + int last_task_node_fd = bpf_map__fd(ctx->obj->maps.last_task_node); + bool has_print_title = false; + if (bpf_map_get_next_key(task_state_node_fd, &key, &next_key)) { + return; + } + while (!bpf_map_get_next_key(task_state_node_fd, &key, &next_key)) { + if (bpf_map_lookup_elem(task_state_node_fd, &next_key, &task_node) < 0) { + printf("Failed to lookup task state node for PID in interval%d\n", next_key); + } else { + if (task_node.has_state_info) { + struct task_last_state last_task_node = {0}; + unsigned long memtotal = sysconf(_SC_PHYS_PAGES); + if (!has_print_title) { + printf("\n======================== Per-Process State Statistics ============================\n"); + print_time(stdout); + printf("\n"); + has_print_title = true; + } + printf("TGID: %d, PID: %d, COMM: %s, PRIO: %d\n", + task_node.tgid, task_node.pid, task_node.comm, task_node.priority); + bpf_map_lookup_elem(last_task_node_fd, &next_key, &last_task_node); + printf("MEM: %.3f, Read: %.3fkb, Write: %.3fkb\n", + (task_node.curr_state_info.memused * 100.0 / memtotal), + 1.0 * (task_node.curr_state_info.readchar - last_task_node.readchar) / 1024, + 1.0 * (task_node.curr_state_info.writechar - last_task_node.writechar) / 1024); + if(task_node.curr_state_info.freepages_delay - last_task_node.freepages_delay) + printf("FreePages: %ld ", + task_node.curr_state_info.freepages_delay - last_task_node.freepages_delay); + if(task_node.curr_state_info.thrashing_delay - last_task_node.thrashing_delay) + printf("Thrashing: %ld ", + task_node.curr_state_info.thrashing_delay - last_task_node.thrashing_delay); + if(task_node.curr_state_info.swapin_delay - last_task_node.swapin_delay) + printf("SwapIn: %ld ", + task_node.curr_state_info.swapin_delay - last_task_node.swapin_delay); + if (task_node.curr_state_info.freepages_delay - last_task_node.freepages_delay || + task_node.curr_state_info.thrashing_delay - last_task_node.thrashing_delay || + 
task_node.curr_state_info.swapin_delay - last_task_node.swapin_delay)
+                        printf("\n");
+                    print_state_info(ctx->obj->maps.state_info, task_node.pid);
+
+                    last_task_node.memused = task_node.curr_state_info.memused;
+                    last_task_node.readchar = task_node.curr_state_info.readchar;
+                    last_task_node.writechar = task_node.curr_state_info.writechar;
+                    last_task_node.freepages_delay = task_node.curr_state_info.freepages_delay;
+                    last_task_node.thrashing_delay = task_node.curr_state_info.thrashing_delay;
+                    last_task_node.swapin_delay = task_node.curr_state_info.swapin_delay;
+                    bpf_map_update_elem(last_task_node_fd, &next_key, &last_task_node, BPF_ANY);
+
+                    if ((task_node.last_state & TASK_DEAD) ||
+                        (task_node.last_state & EXIT_DEAD)) {
+                        printf("[DeadInfo]: %d dead at %u\n",
+                               task_node.pid, (unsigned int)task_node.last_time);
+                        bpf_map_delete_elem(task_state_node_fd, &next_key);
+                    } else if (task_node.last_state & EXIT_ZOMBIE) {
+                        printf("[ZombieInfo]: %d Zombie at %u not dead\n",
+                               task_node.pid, (unsigned int)task_node.last_time);
+                        task_node.has_state_info = false;
+                        bpf_map_update_elem(task_state_node_fd, &next_key, &task_node, BPF_ANY);
+                    } else {
+                        task_node.has_state_info = false;
+                        bpf_map_update_elem(task_state_node_fd, &next_key, &task_node, BPF_ANY);
+                    }
+                }
+            }
+            key = next_key;
+        }
+    } else {
+        printf("========================= SYS-State Statistics ====================\n");
+        print_time(stdout);
+        printf("\n");
+        print_state_info(ctx->obj->maps.state_info, -1);
+        printf("\n");
+    }
+}
+
+/* Print statistics for a process when sample event is triggered */
+static void bpf_task_state_sample(struct prof_dev *dev, union perf_event *event, int instance)
+{
+    struct taskstate_ctx *ctx = dev->private;
+    struct stacktrace_event *info = (void *)event->sample.array + sizeof(u64) + sizeof(u32);
+    struct task_state_node task_node = {0};
+    int task_state_fd = bpf_map__fd(ctx->obj->maps.task_state_node);
+    int stack_map_fd = bpf_map__fd(ctx->obj->maps.stack_map);
+    bool has_print_title = false;
+    if (dev->env->greater_than) {
+        if (bpf_map_lookup_elem(task_state_fd, &info->pid, &task_node) < 0) {
+            printf("Failed to lookup task state node for PID %d in sample\n", info->pid);
+        } else {
+            if (task_node.has_state_info) {
+                if (!has_print_title) {
+                    printf("\n======================== Per-Process State Statistics ============================\n");
+                    print_time(stdout);
+                    printf("\n");
+                    has_print_title = true;
+                }
+                printf("TGID: %d, PID: %d, COMM: %s, STATE: %s, PRIO: %d, latency: %luus, total_delay: %lums\n",
+                       task_node.tgid, task_node.pid, task_node.comm, state_to_str(info->state), task_node.priority,
+                       info->latency / NSEC_PER_USEC, info->total_delay / NSEC_PER_MSEC);
+                print_state_info(ctx->obj->maps.state_info, task_node.pid);
+                if (dev->env->callchain)
+                    print_stacks(stack_map_fd, info, task_node.pid, ctx);
+
+                task_node.has_state_info = false;
+                bpf_map_update_elem(task_state_fd, &info->pid, &task_node, BPF_ANY);
+            }
+        }
+    }
+}
+
+/* Cleanup and print interval statistics */
+static void bpf_task_state_deinit(struct prof_dev *dev)
+{
+    bpf_task_state_interval(dev);
+    monitor_ctx_exit(dev);
+}
+
+static const char *bpf_task_state_desc[] = PROFILER_DESC("bpf:task-state",
+    "[OPTION...] 
[-S] [-D] [--filter comm] [--perins] [--than ns] [-g [--flame-graph file]]",
+    "Show task state statistics via BPF.",
+    "",
+    "BPF-TASK-STATE",
+    "    int pid          # process id",
+    "    int state        # task state",
+    "    u64 last_time    # last state change time",
+    "",
+    "EXAMPLES",
+    "    "PROGRAME" bpf:task-state -i 1000",
+    "    "PROGRAME" bpf:task-state -p 1234 -i 1000",
+    "    "PROGRAME" bpf:task-state --filter 'java,python*' -i 1000",
+    "    "PROGRAME" bpf:task-state -i 1000 --no-interruptible",
+    "    "PROGRAME" bpf:task-state -p 1234 -SD --than 20ms -g",
+    "    "PROGRAME" bpf:task-state --perins --than 1ms -i 1000"
+);
+
+static const char *bpf_task_state_argv[] = PROFILER_ARGV("bpf:task-state",
+    PROFILER_ARGV_OPTION, "inherit",
+    PROFILER_ARGV_PROFILER, "interruptible", "uninterruptible",
+    "filter\nFilter process comm",
+    "perins", "than", "call-graph", "bpf-python-callchain"
+);
+
+struct monitor bpf_task_state = {
+    .name = "bpf:task-state",
+    .desc = bpf_task_state_desc,
+    .argv = bpf_task_state_argv,
+    .pages = 8,
+    .init = bpf_task_state_init,
+    .filter = bpf_task_state_filter,
+    .deinit = bpf_task_state_deinit,
+    .interval = bpf_task_state_interval,
+    .sample = bpf_task_state_sample,
+};
+
+MONITOR_REGISTER(bpf_task_state)
\ No newline at end of file
diff --git a/tests/test_bpf-task-state.py b/tests/test_bpf-task-state.py
new file mode 100644
index 0000000..1b11b24
--- /dev/null
+++ b/tests/test_bpf-task-state.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+
+from PerfProf import PerfProf
+from conftest import result_check
+
+def test_task_state_mode0(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000
+    task_state = PerfProf(['bpf:task-state', '-i', '1000'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode1(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --filter 'python,sh,bash,ls,ps,awk,grep'
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--filter', 'python,sh,bash,ls,ps,awk,grep'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode2(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 -SD
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '-SD'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode3(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 -SD --filter 'python,sh,bash,ls,ps,awk,grep'
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '-SD', '--filter', 'python,sh,bash,ls,ps,awk,grep'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode0_NOS(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --no-interruptible
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--no-interruptible'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode1_NOS(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --filter 'python,sh,bash,ls,ps,awk,grep' --no-interruptible
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--no-interruptible'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
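
+
+# A hedged extra example (assumes the usual PerfProf/conftest helpers): combine a
+# pid filter with --perins to exercise the per-instance accounting path.
+def test_task_state_pid_perins(runtime, memleak_check):
+    # perf-prof bpf:task-state -p 1 -i 1000 --perins
+    task_state = PerfProf(['bpf:task-state', '-p', '1', '-i', '1000', '--perins'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)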

+
+def test_task_state_mode1_than(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --filter 'python,sh,bash,ls,ps,awk,grep' --than 1s
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--than', '1s'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode1_than_perins(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --filter 'python,sh,bash,ls,ps,awk,grep' --than 1s --perins
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--than', '1s', '--perins'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode3_than(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 -SD --filter 'python,sh,bash,ls,ps,awk,grep' --than 1s
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '-SD', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--than', '1s'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode3_than_perins(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 -SD --filter 'python,sh,bash,ls,ps,awk,grep' --than 1s --perins
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '-SD', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--than', '1s', '--perins'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_filter(runtime, memleak_check):
+    # perf-prof bpf:task-state --filter 'java,python*' -S --than 100ms -g
+    task_state = PerfProf(['bpf:task-state', '--filter', 'java,python*', '-S', '--than', '100ms', '-g'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_pthread1(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 --perins -m 64 -- ./pthread --loop 10 --depth 3
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '--perins', '-m', '64', '--', './pthread', '--loop', '10', '--depth', '3'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_pthread2(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 --perins -m 64 -- ./pthread --depth 2
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '--perins', '-m', '64', '--', './pthread', '--depth', '2'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_pthread3(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 --perins -m 64 -- ./pthread --daemonize --depth 5
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '--perins', '-m', '64', '--', './pthread', '--daemonize', '--depth', '5'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)

From 2b6464686b35c5963172551dff8883fa2154a4bb Mon Sep 17 00:00:00 2001
From: AlbertXu
Date: Thu, 14 Aug 2025 10:43:30 +0800
Subject: [PATCH 2/4] bpf-pystack: use eBPF to maintain the call stacks of
 Python threads and output them together with kernel and user stacks at the
 appropriate times.

Compared to pystack, bpf-pystack supports stack maintenance for newly created processes.
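

The tool attaches to the python:function__entry and python:function__return
USDT probes of libpython. On entry it formats "filename funcname" into a
buffer, interns the string in the str_id_to_str hash map keyed by its FNV-1a
hash, and pushes {hash, lineno, timestamp} onto the per-thread thread_stack
map; on return it pops the frame, verifies the hash, and keeps the
longest-running frame in slot 0 of the stack.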


Usage examples:
- --bpf-python-callchain (e.g. --bpf-python-callchain /usr/local/lib/libpython3.6m.so.1.0)
---
 Build                                |   2 +
 bpf-skel/Build                       |   1 +
 bpf-skel/bpf_pystack.bpf.c           | 209 +++++++++++++++++++++++++++
 bpf-skel/bpf_pystack.h               |  18 +++
 bpf_pystack.c                        | 158 ++++++++++++++++++++
 monitor.c                            |   4 +
 monitor.h                            |   9 +-
 pystack.c                            | 166 ++++++++++++++-------
 task-state.c                         |   2 +-
 tests/test_--bpf-python-callchain.py |  10 ++
 10 files changed, 527 insertions(+), 52 deletions(-)
 create mode 100644 bpf-skel/bpf_pystack.bpf.c
 create mode 100644 bpf-skel/bpf_pystack.h
 create mode 100644 bpf_pystack.c
 create mode 100644 tests/test_--bpf-python-callchain.py

diff --git a/Build b/Build
index 93d72c6..35a84fb 100644
--- a/Build
+++ b/Build
@@ -38,6 +38,8 @@ perf-prof-y += tlbstat.o
 perf-prof-y += list.o
 perf-prof-y += kcore.o
 perf-prof-$(CONFIG_LIBBPF) += bpf_kvm_exit.o
+perf-prof-$(CONFIG_LIBBPF) += bpf_task_state.o
+perf-prof-$(CONFIG_LIBBPF) += bpf_pystack.o
 
 bin-y += perf-prof
 
diff --git a/bpf-skel/Build b/bpf-skel/Build
index 744262d..8d28243 100644
--- a/bpf-skel/Build
+++ b/bpf-skel/Build
@@ -1,2 +1,3 @@
 perf-prof-$(CONFIG_LIBBPF) += kvm_exit.skel.h
 perf-prof-$(CONFIG_LIBBPF) += bpf_task_state.skel.h
+perf-prof-$(CONFIG_LIBBPF) += bpf_pystack.skel.h
\ No newline at end of file
diff --git a/bpf-skel/bpf_pystack.bpf.c b/bpf-skel/bpf_pystack.bpf.c
new file mode 100644
index 0000000..53a3195
--- /dev/null
+++ b/bpf-skel/bpf_pystack.bpf.c
@@ -0,0 +1,209 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>   // BPF helper functions
+#include <bpf/bpf_tracing.h>
+#include "bpf_pystack.h"
+#include <bpf/usdt.bpf.h>      // BPF_USDT
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+// Format string kept in the read-only data section
+const char fmt[] SEC(".rodata") = "%s %s";
+// All-zero template used when inserting into thread_stack
+const struct stack_t zero_stack = {.depth = 1};
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(key_size, sizeof(u64));
+    __uint(value_size, MAX_STR_LEN);
+    __uint(max_entries, 1024);
+} str_id_to_str SEC(".maps");
+
+// Per-thread call-stack map: key=tid, value=stack_t
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(key_size, sizeof(u32));
+    __uint(value_size, sizeof(struct stack_t));
+    __uint(max_entries, 10240);
+} thread_stack SEC(".maps");
+
+/* Hash a string (FNV-1a) */
+static __always_inline u64 fnv1a_hash(const char *str, int len) {
+    u64 hash = 14695981039346656037ULL;
+    #pragma unroll
+    for (int i = 0; i < MAX_STR_LEN; i++) {
+        char c = str[i];
+        if (c == 0)
+            break;
+        hash ^= c;
+        hash *= 1099511628211ULL;
+    }
+    return hash;
+}
+
+/* String comparison */
+static __always_inline int my_bpf_strncmp(const char *s1, char *s2, int n)
+{
+    #pragma unroll
+    for (int i = 0; i < n; i++) {
+        char c1 = s1[i];
+        char c2 = s2[i];
+        if (c1 != c2)
+            return 1;
+        if (c1 == '\0')
+            return 0;
+    }
+    return 0;
+}
+
+static __always_inline int fill_func_buf(char *buf, size_t buf_sz, long filename_ptr, long funcname_ptr) {
+    __u64 data[2];
+    data[0] = filename_ptr;
+    data[1] = funcname_ptr;
+    return bpf_snprintf(buf, buf_sz, fmt, data, sizeof(data));
+}
+
+/* Shared: hash the string and look it up in / insert it into the map; returns the map value pointer or NULL */
+static __always_inline char *get_or_insert_funcstr(u64 *hash_out, char *buf, int len) {
+    u64 hash = fnv1a_hash(buf, len);
+    char *filefunc = bpf_map_lookup_elem(&str_id_to_str, &hash);
+    if (!filefunc) {
+        if (len >= MAX_STR_LEN)
+            return NULL;
+        bpf_map_update_elem(&str_id_to_str, &hash, buf, BPF_ANY);
+        *hash_out = hash;
+        return buf;
+    } else {
+        if (my_bpf_strncmp(filefunc, buf, MAX_STR_LEN) != 0) {
+            bpf_printk("hash err String mismatch: %s != %s\n", filefunc, buf);
+            return NULL;
+        }
+        *hash_out = hash;
+        return filefunc;
+    }
+}
+
+/* Shared: get the per-thread stack pointer, initializing it on first use */
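
+/* Note: zero_stack.depth == 1 reserves pystack[0], which stack_pop_and_check() uses to record the longest-observed frame delay. */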

+static __always_inline struct stack_t *get_or_init_stack(u32 tid) {
+    struct stack_t *stack = bpf_map_lookup_elem(&thread_stack, &tid);
+    if (!stack) {
+        bpf_map_update_elem(&thread_stack, &tid, &zero_stack, BPF_ANY);
+        stack = bpf_map_lookup_elem(&thread_stack, &tid);
+        if (!stack) {
+            bpf_printk("Failed to get or init stack for tid %d\n", tid);
+            return NULL;
+        }
+    }
+    return stack;
+}
+
+/* Shared: push onto the call stack */
+static __always_inline int stack_push(struct stack_t *stack, u64 hash, int lineno, u64 now) {
+    if (stack->depth < MAX_STACK_DEPTH) {
+        stack->pystack[stack->depth].filefunc = hash;
+        stack->pystack[stack->depth].lineno = lineno;
+        stack->pystack[stack->depth].time = now;
+        bpf_printk("[entry] push function_entry: %llu %d depth %d\n",
+                   stack->pystack[stack->depth].filefunc,
+                   stack->pystack[stack->depth].lineno,
+                   stack->depth);
+        stack->depth++;
+        return 0;
+    }
+    return -1;
+}
+
+/* Shared: pop and verify the hash */
+static __always_inline int stack_pop_and_check(struct stack_t *stack, u64 hash, u64 time) {
+    if (stack->depth > 1) {
+        stack->depth--;
+        int cur_depth = stack->depth;
+        if (cur_depth < 1 || cur_depth >= MAX_STACK_DEPTH) {
+            stack->depth++; // restore depth
+            bpf_printk("[return] stack underflow or overflow for tid\n");
+            return -1;
+        }
+        // Use cur_depth for all array accesses
+        if (cur_depth > 0 && cur_depth < MAX_STACK_DEPTH) {
+            if (stack->pystack[cur_depth].filefunc != hash) {
+                stack->depth++; // restore depth
+                bpf_printk("[return] hash %llu != %llu filefunc\n", hash, stack->pystack[cur_depth].filefunc);
+                return -2;
+            }
+            u64 delay = time - stack->pystack[cur_depth].time;
+            if (delay > stack->pystack[0].time) {
+                stack->pystack[0].filefunc = stack->pystack[cur_depth].filefunc;
+                stack->pystack[0].lineno = stack->pystack[cur_depth].lineno;
+                stack->pystack[0].time = delay;
+                bpf_printk("[return] %lu - %lu = %lu\n", time, stack->pystack[cur_depth].time, delay);
+            }
+            bpf_printk("[return] pop success function_return: %llu %d delay %llu depth %d\n",
+                       stack->pystack[cur_depth].filefunc,
+                       stack->pystack[cur_depth].lineno,
+                       delay,
+                       cur_depth);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+SEC("usdt/python:function__entry")
+int BPF_USDT(probe_function_entry, long filename_ptr, long funcname_ptr, long lineno) {
+    char buf[128];
+    long ret;
+    u64 hash, now;
+    char *filefunc;
+    now = bpf_ktime_get_ns();
+
+    /* 1. Concatenate the filename and funcname strings into buf */
+    ret = fill_func_buf(buf, sizeof(buf), filename_ptr, funcname_ptr);
+    if (ret <= 0) return 0;
+
+    //bpf_printk("[entry]:%s %d\n", buf, lineno);
+
+    /* 2. Hash the buf string and store it in str_id_to_str */
+    filefunc = get_or_insert_funcstr(&hash, buf, ret);
+    if (!filefunc) return 0;
+
+    /* 3. Get the current thread's thread_stack */
+    u32 tid = bpf_get_current_pid_tgid();
+    struct stack_t *stack = get_or_init_stack(tid);
+    if (!stack) return 0;
+
+    /* 4. Push the id onto the call stack */
+    if (stack_push(stack, hash, lineno, now) < 0)
+        bpf_printk("[entry] stack overflow for tid %d\n", tid);
+
+    return 0;
+}
+
+SEC("usdt/python:function__return")
+int BPF_USDT(probe_function_return, long filename_ptr, long funcname_ptr, long lineno) {
+    char buf[128];
+    long ret;
+    u64 hash, now;
+    char *filefunc;
+    now = bpf_ktime_get_ns();
+
+    /* 1. Concatenate the filename and funcname strings into buf */
+    ret = fill_func_buf(buf, sizeof(buf), filename_ptr, funcname_ptr);
+    if (ret <= 0) return 0;
+
+    //bpf_printk("[return]:%s %d\n", buf, lineno);
+
+    /* 2. Hash the buf string and store it in str_id_to_str */
+    filefunc = get_or_insert_funcstr(&hash, buf, ret);
+    if (!filefunc) return 0;
+
+    /* 3. Get the current thread's thread_stack */
+    u32 tid = bpf_get_current_pid_tgid();
+    struct stack_t *stack = get_or_init_stack(tid);
+    if (!stack) return 0;
+
+    /* 4. Pop the id from the call stack */
+    stack_pop_and_check(stack, hash, now);
+
+    return 0;
+}
+
diff --git a/bpf-skel/bpf_pystack.h b/bpf-skel/bpf_pystack.h
new file mode 100644
index 0000000..f679860
--- /dev/null
+++ b/bpf-skel/bpf_pystack.h
@@ -0,0 +1,18 @@
+#include
+#include
+
+// Maximum call-stack depth
+#define MAX_STACK_DEPTH 32
+// Maximum string length
+#define MAX_STR_LEN 128
+
+struct perstack {
+    u64 filefunc;
+    u64 time;
+    int lineno;
+};
+// Per-thread call-stack structure
+struct stack_t {
+    u32 depth;
+    struct perstack pystack[MAX_STACK_DEPTH];
+};
\ No newline at end of file
diff --git a/bpf_pystack.c b/bpf_pystack.c
new file mode 100644
index 0000000..a47c244
--- /dev/null
+++ b/bpf_pystack.c
@@ -0,0 +1,158 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "trace_helpers.h"
+
+struct bpf_pystack_ctx {
+    struct bpf_pystack_bpf *obj;
+    void *fixed_event;
+    struct perf_evsel *evsel;
+    //char binary_path[BINARY_PATH_SIZE];
+};
+static const char *object;
+static int monitor_ctx_init(struct prof_dev *dev)
+{
+    struct bpf_pystack_ctx *ctx = zalloc(sizeof(*ctx));
+    if (!ctx)
+        return -1;
+    dev->private = ctx;
+
+    ctx->obj = bpf_pystack_bpf__open();
+    if (!ctx->obj) {
+        fprintf(stderr, "Failed to open BPF skeleton\n");
+        goto free_ctx;
+    }
+    return 0;
+free_ctx:
+    free(ctx);
+    return -1;
+}
+
+static void monitor_ctx_exit(struct prof_dev *dev)
+{
+    struct bpf_pystack_ctx *ctx = dev->private;
+    bpf_pystack_bpf__destroy(ctx->obj);
+    free(ctx->fixed_event);
+    free(ctx);
+}
+
+static int bpf_pystack_init(struct prof_dev *dev)
+{
+    struct bpf_pystack_ctx *ctx;
+    struct env *env = dev->env;
+    struct perf_evlist *evlist = dev->evlist;
+    struct perf_event_attr attr = {
+        .type = PERF_TYPE_SOFTWARE,
+        .config = PERF_COUNT_SW_BPF_OUTPUT,
+        .size = sizeof(struct perf_event_attr),
+        .sample_type = PERF_SAMPLE_TIME,
+        .sample_period = 1,
+        .pinned = 1,
+        .disabled = 1,
+        .watermark = 1,
+    };
+    if (monitor_ctx_init(dev) < 0)
+        return -1;
+    ctx = dev->private;
+    prof_dev_env2attr(dev, &attr);
+
+    ctx->evsel = perf_evsel__new(&attr);
+    if (!ctx->evsel)
+        goto failed;
+
+    perf_evlist__add(evlist, ctx->evsel);
+    if (bpf_pystack_bpf__load(ctx->obj))
+        goto failed;
+
+    ctx->fixed_event = malloc(PERF_SAMPLE_MAX_SIZE);
+    if (!ctx->fixed_event) {
+        printf("Failed to allocate fixed_event buffer\n");
+        goto failed;
+    }
+    object = env->bpf_python_callchain;
+    printf("bpf_pystack object: %s\n", object);
+    return 0;
+
+failed:
+    monitor_ctx_exit(dev);
+    return -1;
+}
+
+static int bpf_pystack_filter(struct prof_dev *dev)
+{
+    struct bpf_pystack_ctx *ctx = dev->private;
+    int err;
+
+    ctx->obj->links.probe_function_entry = bpf_program__attach_usdt(ctx->obj->progs.probe_function_entry, -1,
+                                                object, "python", "function__entry", NULL);
+    if (!ctx->obj->links.probe_function_entry) {
+        err = errno;
+        fprintf(stderr, "attach usdt probe_function_entry failed: %s\n", strerror(err));
+    }
+
+    ctx->obj->links.probe_function_return = bpf_program__attach_usdt(ctx->obj->progs.probe_function_return, -1,
+                                                object, "python", "function__return", NULL);
+    if (!ctx->obj->links.probe_function_return) {
+        err = errno;
+        fprintf(stderr, "attach usdt probe_function_return failed: %s\n", strerror(err));
+    }
+    return 1;
+}
+
+struct monitor bpf_pystack = {
+    .name = "bpf-pystack",
+    .pages = 4,
+    .init = 
bpf_pystack_init, + .filter = bpf_pystack_filter, + .deinit = monitor_ctx_exit, +}; +MONITOR_REGISTER(bpf_pystack) + +int bpf_pystack_link(struct prof_dev *main_dev) +{ + struct prof_dev *bpf_pydev; + struct env *e; + if (main_dev->prof == &bpf_pystack) + return 0; + e = clone_env(main_dev->env); + if (!e) + return -1; + + // Specifies the parent of pydev so that the real main_dev can be found + // when heap-sorting pydev events. See order_main_dev(). + bpf_pydev = prof_dev_open_cpu_thread_map(&bpf_pystack, e, NULL, NULL, main_dev); + if (!bpf_pydev){ + printf("fail to open bpf_pystack device\n"); + return -1; + } + + if (order_together(main_dev, bpf_pydev) < 0) { + prof_dev_close(bpf_pydev); + return -1; + } + main_dev->links.pystack = bpf_pydev; + return 0; +} + +void bpf_pystack_unlink(struct prof_dev *main_dev) +{ + if (main_dev->links.pystack) { + prof_dev_close(main_dev->links.pystack); + main_dev->links.pystack = NULL; + } +} diff --git a/monitor.c b/monitor.c index a2dcf4c..af32636 100644 --- a/monitor.c +++ b/monitor.c @@ -469,6 +469,7 @@ struct option main_options[] = { OPT_BOOLEAN_SET ( 0 , "user-callchain", &env.user_callchain, &env.user_callchain_set, "include user callchains, no- prefix to exclude"), OPT_BOOLEAN_SET ( 0 , "kernel-callchain", &env.kernel_callchain, &env.kernel_callchain_set, "include kernel callchains, no- prefix to exclude"), OPT_BOOL_NONEG ( 0 , "python-callchain", &env.python_callchain, "include python callchains"), + OPT_STRDUP_NONEG(0, "bpf-python-callchain", &env.bpf_python_callchain, "path of your libpython (/usr/lib64/libpython3.6m.so.1.0)", "include python callchains by bpf, extra string"), OPT_INT_OPTARG_SET( 0 , "irqs_disabled", &env.irqs_disabled, &env.irqs_disabled_set, 1, "0|1", "ebpf, irqs disabled or not."), OPT_INT_OPTARG_SET( 0 , "tif_need_resched", &env.tif_need_resched, &env.tif_need_resched_set, 1, "0|1", "ebpf, TIF_NEED_RESCHED is set or not."), OPT_INT_NONEG_SET ( 0 , "exclude_pid", &env.exclude_pid, &env.exclude_pid_set, "pid", "ebpf, exclude pid"), @@ -2196,6 +2197,9 @@ struct prof_dev *prof_dev_open_internal(profiler *prof, struct env *env, if (env->python_callchain) if (pystack_link(dev) < 0) goto out_order_deinit; + if (env->bpf_python_callchain) + if (bpf_pystack_link(dev) < 0) + goto out_order_deinit; } if (dev->env->interval) { diff --git a/monitor.h b/monitor.h index e2c206d..ce4c263 100644 --- a/monitor.h +++ b/monitor.h @@ -139,6 +139,8 @@ struct env { bool user_callchain, user_callchain_set; bool kernel_callchain, kernel_callchain_set; bool python_callchain; + char *bpf_python_callchain; + //bool bpf_python_callchain; // ebpf bool irqs_disabled_set, tif_need_resched_set, exclude_pid_set; bool nr_running_min_set, nr_running_max_set; @@ -577,11 +579,11 @@ perfclock_t prof_dev_list_minevtime(void); "usage-self", "sampling-limit", "perfeval-cpus", "perfeval-pids", "version", "verbose", "quiet", "help" #define PROFILER_ARGV_FILTER \ "FILTER OPTION:", \ - "exclude-host", "exclude-guest", "exclude-user", "exclude-kernel", "python-callchain", \ + "exclude-host", "exclude-guest", "exclude-user", "exclude-kernel", "python-callchain", "bpf-python-callchain",\ "user-callchain", "kernel-callchain", \ "irqs_disabled", "tif_need_resched", "exclude_pid", "nr_running_min", "nr_running_max" #define PROFILER_ARGV_CALLCHAIN_FILTER \ - "FILTER OPTION:", "user-callchain", "kernel-callchain", "python-callchain" + "FILTER OPTION:", "user-callchain", "kernel-callchain", "python-callchain", "bpf-python-callchain" #define PROFILER_ARGV_PROFILER \ 
"PROFILER OPTION:" \ @@ -708,4 +710,7 @@ void pystack_unlink(struct prof_dev *main_dev); union perf_event * pystack_perf_event(struct prof_dev *main_dev, union perf_event *event, bool *writable, int reserved); +//bpf_pystack.c +int bpf_pystack_link(struct prof_dev *main_dev); +void bpf_pystack_unlink(struct prof_dev *main_dev); #endif diff --git a/pystack.c b/pystack.c index 244d9b8..9387b17 100644 --- a/pystack.c +++ b/pystack.c @@ -5,6 +5,10 @@ #include #include #include +#include +#include +#include +#include #define PYSTACK_MAX_DEPTH 128 @@ -248,13 +252,25 @@ static void pystack_sample(struct prof_dev *dev, union perf_event *event, int in } __packed raw; } *data = (void *)event->sample.array; unsigned short common_type = data->raw.common_type; - struct function__entry_return *function = &data->raw.function; - const char *filename = (const char *)function + function->filename_offset; - const char *funcname = (const char *)function + function->funcname_offset; - int lineno = function->lineno; + // struct function__entry_return *function = &data->raw.function; + // const char *filename = (const char *)function + function->filename_offset; + // const char *funcname = (const char *)function + function->funcname_offset; + // int lineno = function->lineno; + // struct pystack_node tmp, *node; + // struct rb_node *rbn; + // char buf[4096]; + struct function__entry_return function_tmp; + struct function__entry_return *function; + const char *filename,*funcname; + int lineno; struct pystack_node tmp, *node; struct rb_node *rbn; char buf[4096]; + memcpy(&function_tmp, &data->raw.function, sizeof(function_tmp)); + function = &function_tmp; + filename = (const char *)function + function->filename_offset; + funcname = (const char *)function + function->funcname_offset; + lineno = function->lineno; if (!function->common_pid) return; @@ -267,9 +283,11 @@ static void pystack_sample(struct prof_dev *dev, union perf_event *event, int in node = rb_entry_safe(rbn, struct pystack_node, rbnode); if (node) { int n = snprintf(buf, sizeof(buf), "%s (%s:%d)", funcname, filename, lineno); + printf("[pystack in stack] %s,%s,%d\n", funcname, filename, lineno); if (common_type == ctx->fun_entry) { if (node->depth < PYSTACK_MAX_DEPTH) { const char *str = unique_string_len(buf, n>sizeof(buf) ? 
@@ -335,9 +353,6 @@ union perf_event *
 pystack_perf_event(struct prof_dev *main_dev, union perf_event *event, bool *writable, int reserved)
 {
     struct prof_dev *pydev = main_dev->links.pystack;
-    struct pystack_ctx *ctx;
-    struct pystack_node tmp, *node;
-    struct rb_node *rbn;
     void *data;
     int pid;
     bool callchain;
@@ -362,47 +378,99 @@ pystack_perf_event(struct prof_dev *main_dev, union perf_event *event, bool *wri
     if (!callchain)
         return event;
 
-    ctx = pydev->private;
-    tmp.pid = pid;
-    rbn = rblist__find(&ctx->pystack, &tmp);
-    node = rb_entry_safe(rbn, struct pystack_node, rbnode);
-    if (node) {
-        union perf_event *new_event = ctx->fixed_event + reserved;
-        struct callchain *cc = data + main_dev->pos.callchain_pos;
-        int copy_len = (void *)&cc->ips[cc->nr] - (void *)event;
-        int depth = node->depth;
-        int d;
-
-        /*
-         * Generate new events: put PERF_CONTEXT_PYSTACK at the end of
-         * the callchain.
-         *
-         * { u64 nr,
-         *   u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
-         *
-         * Default callchain context order:
-         *     PERF_CONTEXT_KERNEL
-         *     PERF_CONTEXT_USER
-         *     PERF_CONTEXT_PYSTACK => Contains only the unique string:
-         *                             "funcname (filename:lineno)"
-         */
-        memcpy(new_event, event, copy_len);
-        data = (void *)new_event->sample.array;
-        cc = data + main_dev->pos.callchain_pos;
-        cc->ips[cc->nr++] = PERF_CONTEXT_PYSTACK;
-
-        if (depth > PYSTACK_MAX_DEPTH)
-            depth = PYSTACK_MAX_DEPTH;
-        d = depth;
-        while (depth-- > 0)
-            cc->ips[cc->nr++] = (u64)node->stack[depth];
-
-        if (event->header.size > copy_len)
-            memcpy((void *)&cc->ips[cc->nr], (void *)event + copy_len, event->header.size - copy_len);
-
-        new_event->header.size += (d+1) * sizeof(u64);
-        *writable = 1;
-        return new_event;
+    if (main_dev->env->python_callchain) {
+        struct pystack_ctx *ctx;
+        struct pystack_node tmp, *node;
+        struct rb_node *rbn;
+
+        tmp.pid = pid;
+        ctx = pydev->private;
+        rbn = rblist__find(&ctx->pystack, &tmp);
+        node = rb_entry_safe(rbn, struct pystack_node, rbnode);
+        if (node) {
+            union perf_event *new_event = ctx->fixed_event + reserved;
+            struct callchain *cc = data + main_dev->pos.callchain_pos;
+            int copy_len = (void *)&cc->ips[cc->nr] - (void *)event;
+            int depth = node->depth;
+            int d;
+
+            /*
+             * Generate a new event: put PERF_CONTEXT_PYSTACK at the end of
+             * the callchain.
+             *
+             * { u64 nr,
+             *   u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
+             *
+             * Default callchain context order:
+             *     PERF_CONTEXT_KERNEL
+             *     PERF_CONTEXT_USER
+             *     PERF_CONTEXT_PYSTACK => Contains only the unique string:
+             *                             "funcname (filename:lineno)"
+             */
+            memcpy(new_event, event, copy_len);
+            data = (void *)new_event->sample.array;
+            cc = data + main_dev->pos.callchain_pos;
+            cc->ips[cc->nr++] = PERF_CONTEXT_PYSTACK;
+
+            if (depth > PYSTACK_MAX_DEPTH)
+                depth = PYSTACK_MAX_DEPTH;
+            d = depth;
+            while (depth-- > 0)
+                cc->ips[cc->nr++] = (u64)node->stack[depth];
+
+            if (event->header.size > copy_len)
+                memcpy((void *)&cc->ips[cc->nr], (void *)event + copy_len, event->header.size - copy_len);
+            new_event->header.size += (d+1) * sizeof(u64);
+            *writable = 1;
+            return new_event;
+        }
+    } else if (main_dev->env->bpf_python_callchain) {
+        /* Local mirror of the leading members of struct bpf_pystack_ctx
+         * defined in bpf_pystack.c. */
+        struct bpf_pystack_ctx {
+            struct bpf_pystack_bpf *obj;
+            void *fixed_event;
+        } *ctx = pydev->private;
+        struct stack_t node = {0};
+        int str_id_to_str_fd, thread_stack_fd;
+
+        str_id_to_str_fd = bpf_map__fd(ctx->obj->maps.str_id_to_str);
+        thread_stack_fd = bpf_map__fd(ctx->obj->maps.thread_stack);
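+
+        /*
+         * Read this thread's Python stack snapshot from the BPF maps:
+         * thread_stack holds one {filefunc hash, lineno, time} entry per
+         * frame, and str_id_to_str resolves each hash back to its
+         * "filename funcname" string. The frames are appended to the perf
+         * callchain below, mirroring the rblist-based path above.
+         */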
+        if (bpf_map_lookup_elem(thread_stack_fd, &pid, &node) < 0) {
+            return event;
+        } else if (node.depth <= 0 || node.depth > MAX_STACK_DEPTH) {
+            printf("tid %d has invalid python callchain depth %d, skipping this event.\n", pid, node.depth);
+            return event;
+        } else {
+            union perf_event *new_event = ctx->fixed_event + reserved;
+            struct callchain *cc = data + main_dev->pos.callchain_pos;
+            int copy_len = (void *)&cc->ips[cc->nr] - (void *)event;
+            int depth = node.depth;
+            int d;
+
+            if (copy_len <= 0 || copy_len > event->header.size) {
+                printf("Invalid copy_len: %d, event->header.size: %u\n", copy_len, event->header.size);
+                return event;
+            }
+            memcpy(new_event, event, copy_len);
+            data = (void *)new_event->sample.array;
+            cc = data + main_dev->pos.callchain_pos;
+            cc->ips[cc->nr++] = PERF_CONTEXT_PYSTACK;
+
+            if (depth > MAX_STACK_DEPTH)
+                depth = MAX_STACK_DEPTH;
+            d = depth;
+            while (depth-- > 0) {
+                struct perstack *perstack = &node.pystack[depth];
+                char filefunc[MAX_STR_LEN] = {0};
+                char buf[4096];
+                const char *str;
+                int len;
+                int ret = bpf_map_lookup_elem(str_id_to_str_fd, &perstack->filefunc, filefunc);
+
+                if (ret < 0) {
+                    printf("failed to find string for hash %llu in str_id_to_str\n",
+                           (unsigned long long)perstack->filefunc);
+                    d--;    /* keep the header size accounting in step */
+                    continue;
+                }
+                /* Slot 0 caches the completed call with the largest observed
+                 * latency, not a frame of the current stack. */
+                if (depth == 0)
+                    len = snprintf(buf, sizeof(buf), "%s:%d (%.2fms) [Max delay, not in current stack]",
+                                   filefunc, perstack->lineno, (double)(perstack->time / 1000000.00));
+                else
+                    len = snprintf(buf, sizeof(buf), "%s:%d", filefunc, perstack->lineno);
+                str = unique_string_len(buf, len > sizeof(buf) ? sizeof(buf)-1 : len);
+                cc->ips[cc->nr++] = (u64)str;
+            }
+            if (event->header.size > copy_len)
+                memcpy((void *)&cc->ips[cc->nr], (void *)event + copy_len, event->header.size - copy_len);
+            new_event->header.size += (d+1) * sizeof(u64);
+            *writable = 1;
+            return new_event;
+        }
     }
 
     return event;
 }
diff --git a/task-state.c b/task-state.c
index 506477f..f2a66de 100644
--- a/task-state.c
+++ b/task-state.c
@@ -972,7 +972,7 @@ static const char *task_state_argv[] = PROFILER_ARGV("task-state",
     PROFILER_ARGV_OPTION, PROFILER_ARGV_CALLCHAIN_FILTER, PROFILER_ARGV_PROFILER,
     "interruptible", "uninterruptible", "than", "filter\nFilter process comm",
-    "perins", "call-graph", "flame-graph", "ptrace");
+    "perins", "call-graph", "flame-graph", "ptrace", "bpf-python-callchain", "python-callchain");
 struct monitor task_state = {
     .name = "task-state",
     .desc = task_state_desc,
diff --git a/tests/test_--bpf-python-callchain.py b/tests/test_--bpf-python-callchain.py
new file mode 100644
index 0000000..532c9b7
--- /dev/null
+++ b/tests/test_--bpf-python-callchain.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+
+from PerfProf import PerfProf
+from conftest import result_check
+
+def test_bpf_pystack_taskstate(runtime, memleak_check):
+    # perf-prof task-state --filter 'java,python*' -i 1000 --than 100ms -g --bpf-python-callchain /usr/local/lib/libpython3.6m.so.1.0
+    task_state = PerfProf(['task-state', '--filter', 'java,python*', '-i', '1000', '--than', '100ms', '-g', '--bpf-python-callchain', '/usr/local/lib/libpython3.6m.so.1.0'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
\ No newline at end of file

From 6d4c0ebdb4ab0d6b487b1589d9e3879cbac208fd Mon Sep 17 00:00:00 2001
From: albertxu216 <1418106416@qq.com>
Date: Thu, 14 Aug 2025 10:48:23 +0800
Subject: [PATCH 3/4] bpf-pystack: use eBPF to maintain the call stacks of
 Python threads

bpf-pystack uses eBPF USDT probes to maintain the call stacks of Python
threads and outputs the Python stack together with the kernel and user
call stacks at the appropriate times. Compared to pystack, bpf-pystack
also supports stack maintenance for newly created processes.
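
Usage example (the libpython path is environment-specific and the CPython
build must ship the USDT probes; the path below matches the test case):
- Append Python call chains to task-state latency sampling:
  perf-prof task-state --filter 'java,python*' -i 1000 --than 100ms -g --bpf-python-callchain /usr/local/lib/libpython3.6m.so.1.0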
---
 bpf-skel/bpf_pystack.bpf.c | 50 ++++----------------------------------
 1 file changed, 5 insertions(+), 45 deletions(-)

diff --git a/bpf-skel/bpf_pystack.bpf.c b/bpf-skel/bpf_pystack.bpf.c
index 53a3195..bd32b4e 100644
--- a/bpf-skel/bpf_pystack.bpf.c
+++ b/bpf-skel/bpf_pystack.bpf.c
@@ -1,5 +1,5 @@
 #include "vmlinux.h"
-#include // BPF helper functions
+#include
 #include
 #include
 #include "bpf_pystack.h"
@@ -7,9 +7,7 @@
 
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
 
-// Format string in the read-only data section
 const char fmt[] SEC(".rodata") = "%s %s";
-// Zeroed template used when inserting into thread_stack
 const struct stack_t zero_stack = {.depth = 1};
 
 struct {
@@ -19,7 +17,6 @@ struct {
     __uint(max_entries, 1024);
 } str_id_to_str SEC(".maps");
 
-// Per-thread call stack map, key=tid, value=stack_t
 struct {
     __uint(type, BPF_MAP_TYPE_HASH);
     __uint(key_size, sizeof(u32));
@@ -27,7 +24,6 @@ struct {
     __uint(max_entries, 10240);
 } thread_stack SEC(".maps");
 
-/* Hash a string */
 static __always_inline u64 fnv1a_hash(const char *str, int len) {
     u64 hash = 14695981039346656037ULL;
 #pragma unroll
@@ -41,7 +37,6 @@ static __always_inline u64 fnv1a_hash(const char *str, int len) {
     return hash;
 }
 
-/* String comparison */
 static __always_inline int my_bpf_strncmp(const char *s1, char *s2, int n)
 {
 #pragma unroll
@@ -63,7 +58,6 @@ static __always_inline int fill_func_buf(char *buf, size_t buf_sz, long filename
     return bpf_snprintf(buf, buf_sz, fmt, data, sizeof(data));
 }
 
-/* Helper: hash the string and look it up/insert it in the map; returns the map value pointer or NULL */
 static __always_inline char *get_or_insert_funcstr(u64 *hash_out, char *buf, int len) {
     u64 hash = fnv1a_hash(buf, len);
     char *filefunc = bpf_map_lookup_elem(&str_id_to_str, &hash);
@@ -74,60 +68,46 @@ static __always_inline char *get_or_insert_funcstr(u64 *hash_out, char *buf, int
         *hash_out = hash;
         return buf;
     } else {
-        if (my_bpf_strncmp(filefunc, buf, MAX_STR_LEN) != 0){
-            bpf_printk("hash err String mismatch: %s != %s\n", filefunc, buf);
+        if (my_bpf_strncmp(filefunc, buf, MAX_STR_LEN) != 0)
             return NULL;
-        }
         *hash_out = hash;
         return filefunc;
     }
 }
 
-/* Helper: get the thread stack pointer, initializing it if missing */
 static __always_inline struct stack_t *get_or_init_stack(u32 tid) {
     struct stack_t *stack = bpf_map_lookup_elem(&thread_stack, &tid);
     if (!stack) {
         bpf_map_update_elem(&thread_stack, &tid, &zero_stack, BPF_ANY);
         stack = bpf_map_lookup_elem(&thread_stack, &tid);
-        if (!stack) {
-            bpf_printk("Failed to get or init stack for tid %d\n", tid);
+        if (!stack)
             return NULL;
-        }
     }
     return stack;
 }
 
-/* Helper: push onto the call stack */
 static __always_inline int stack_push(struct stack_t *stack, u64 hash, int lineno, u64 now) {
     if (stack->depth < MAX_STACK_DEPTH) {
         stack->pystack[stack->depth].filefunc = hash;
         stack->pystack[stack->depth].lineno = lineno;
         stack->pystack[stack->depth].time = now;
-        bpf_printk("[entry] push function_entry: %llu %d depth %d\n",
-            stack->pystack[stack->depth].filefunc,
-            stack->pystack[stack->depth].lineno,
-            stack->depth);
         stack->depth++;
         return 0;
     }
     return -1;
 }
 
-/* Helper: pop and verify the hash */
 static __always_inline int stack_pop_and_check(struct stack_t *stack, u64 hash, u64 time) {
     if (stack->depth > 1) {
         stack->depth--;
         int cur_depth = stack->depth;
         if (cur_depth < 1 || cur_depth >= MAX_STACK_DEPTH) {
-            stack->depth++; // restore depth
-            bpf_printk("[return] stack underflow or overflow for tid\n");
+            stack->depth++;
             return -1;
         }
-        // use cur_depth for all array accesses
         if (cur_depth > 0 && cur_depth < MAX_STACK_DEPTH) {
             if (stack->pystack[cur_depth].filefunc != hash) {
-                stack->depth++; // restore depth
-                bpf_printk("[return] hash %llu != %llu filefunc \n", hash, stack->pystack[cur_depth].filefunc);
+                stack->depth++;
                 return -2;
             }
             u64 delay = time - stack->pystack[cur_depth].time;
@@ -135,13 +115,7 @@ static __always_inline int stack_pop_and_check(struct stack_t *stack, u64 hash,
             stack->pystack[0].filefunc = stack->pystack[cur_depth].filefunc;
             stack->pystack[0].lineno = stack->pystack[cur_depth].lineno;
             stack->pystack[0].time = delay;
-            bpf_printk("[return] %lu - %lu = %lu \n", time, stack->pystack[cur_depth].time, delay);
         }
-        bpf_printk("[return] pop sucess function_return: %llu %d delay %llu depth %d \n",
-            stack->pystack[cur_depth].filefunc,
-            stack->pystack[cur_depth].lineno,
-            delay,
-            cur_depth);
         return 1;
     }
 }
@@ -156,22 +130,16 @@ int BPF_USDT(probe_function_entry,long filename_ptr,long funcname_ptr,long linen
     char *filefunc;
 
     now = bpf_ktime_get_ns();
-    /*1. Concatenate the filename and funcname strings into buf */
     ret = fill_func_buf(buf, sizeof(buf), filename_ptr, funcname_ptr);
     if (ret <= 0)
         return 0;
-    //bpf_printk("[entry]:%s %d\n", buf, lineno);
-
-    /*2. Hash the buf string and store it in str_id_to_str */
     filefunc = get_or_insert_funcstr(&hash, buf, ret);
     if (!filefunc)
         return 0;
-    /*3. Get the current thread's thread_stack */
     u32 tid = bpf_get_current_pid_tgid();
     struct stack_t *stack = get_or_init_stack(tid);
     if (!stack) return 0;
 
-    /*4. Push the id onto the call stack */
     if (stack_push(stack, hash, lineno, now) < 0)
         bpf_printk("[entry] stack overflow for tid %d\n", tid);
 
@@ -186,24 +154,16 @@ int BPF_USDT(probe_function_return,long filename_ptr,long funcname_ptr,long line
     char *filefunc;
 
    now = bpf_ktime_get_ns();
-    /*1. Concatenate the filename and funcname strings into buf */
     ret = fill_func_buf(buf, sizeof(buf), filename_ptr, funcname_ptr);
     if (ret <= 0)
         return 0;
-
-    //bpf_printk("[return]:%s %d\n", buf, lineno);
-
-    /*2. Hash the buf string and store it in str_id_to_str */
     filefunc = get_or_insert_funcstr(&hash, buf, ret);
     if (!filefunc)
         return 0;
-    /*3. Get the current thread's thread_stack */
     u32 tid = bpf_get_current_pid_tgid();
     struct stack_t *stack = get_or_init_stack(tid);
     if (!stack) return 0;
-    /*4. Pop the id from the call stack */
     stack_pop_and_check(stack, hash, now);
-
     return 0;
 }

From a5d0b22f5aa3c19a11774494b5d27c4f8bb4f6f3 Mon Sep 17 00:00:00 2001
From: albertxu216 <1418106416@qq.com>
Date: Thu, 14 Aug 2025 10:48:48 +0800
Subject: [PATCH 4/4] bpf-pystack: clean up bpf_pystack.h comments

---
 bpf-skel/bpf_pystack.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/bpf-skel/bpf_pystack.h b/bpf-skel/bpf_pystack.h
index f679860..eb0526f 100644
--- a/bpf-skel/bpf_pystack.h
+++ b/bpf-skel/bpf_pystack.h
@@ -1,9 +1,7 @@
 #include
 #include
 
-// Maximum call stack depth
 #define MAX_STACK_DEPTH 32
-// Maximum string length
 #define MAX_STR_LEN 128
 
 struct perstack{
@@ -11,8 +9,7 @@ struct perstack{
     u64 time;
     int lineno;
 };
-// Per-thread call stack
 struct stack_t {
     u32 depth;
     struct perstack pystack[MAX_STACK_DEPTH];
-};
\ No newline at end of file
+};