From 888a219f01d706e522209dbb763b5b647afc0989 Mon Sep 17 00:00:00 2001 From: AlbertXu Date: Wed, 13 Aug 2025 14:30:32 +0800 Subject: [PATCH 1/4] bpf_task_state: a BPF-based tool for collecting and analyzing process state transitions, latency, memory usage, IO, and stack traces in Linux. Features: - Collects per-process and system-wide state statistics (running, sleeping, etc.) - Supports filtering by PID and process name (comm), with wildcard and multi-process support - Tracks memory usage, read/write bytes, and page delay metrics (FreePages, Thrashing, SwapIn) - Outputs kernel and user stack traces for each process, with symbol resolution - Supports interval and event-driven sampling, latency threshold filtering, and per-instance statistics - Provides flame graph data for performance analysis Usage examples: - Show all process state statistics every second: perf-prof bpf:task-state -i 1000 - Filter by PID: perf-prof bpf:task-state -p 1234 -i 1000 - Filter by process name (supports wildcards): perf-prof bpf:task-state --filter 'java,python*' -i 1000 - Only show events with latency greater than 20ms and print stack traces: perf-prof bpf:task-state -p 1234 -SD --than 20ms -g - Output per-instance statistics with latency threshold: perf-prof bpf:task-state --perins --than 1ms -i 1000 --- bpf-skel/Build | 1 + bpf-skel/bpf_task_state.bpf.c | 406 ++++++++++++++++++++++ bpf-skel/task_state.h | 113 +++++++ bpf_task_state.c | 616 ++++++++++++++++++++++++++++++++++ tests/test_bpf-task-state.py | 88 +++++ 5 files changed, 1224 insertions(+) create mode 100644 bpf-skel/bpf_task_state.bpf.c create mode 100644 bpf-skel/task_state.h create mode 100644 bpf_task_state.c create mode 100644 tests/test_bpf-task-state.py diff --git a/bpf-skel/Build b/bpf-skel/Build index 6b1070a..744262d 100644 --- a/bpf-skel/Build +++ b/bpf-skel/Build @@ -1 +1,2 @@ perf-prof-$(CONFIG_LIBBPF) += kvm_exit.skel.h +perf-prof-$(CONFIG_LIBBPF) += bpf_task_state.skel.h diff --git a/bpf-skel/bpf_task_state.bpf.c b/bpf-skel/bpf_task_state.bpf.c new file mode 100644 index 0000000..c98a8e5 --- /dev/null +++ b/bpf-skel/bpf_task_state.bpf.c @@ -0,0 +1,406 @@ +#include "vmlinux.h" +#include +#include +#include + +#include "task_state.h" +#include "perf_output.bpf.h" + +/* Filter configuration */ +const volatile struct filters filter = { + .pid = false, + .comm = false, + .comm_num = 0, + .state = -1, + .latency = 0, + .stack = 0, + .perins = false, +}; + +/* BPF maps definition */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10240); + __type(key, pid_t); + __type(value, struct task_state_node); +} task_state_node SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10240); + __type(key, pid_t); + __type(value, struct task_last_state); +} last_task_node SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10240); + __type(key, struct state_key); + __type(value, struct state_info); +} state_info SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(__u32)); + __uint(value_size, MAX_STACKS * sizeof(__u64)); + __uint(max_entries, 10240); +} stack_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, int); + __type(value, int); +} target_tgids SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, char[16]); + __type(value, bool); +} target_comms_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + 
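/* Comm patterns supplied via --filter; a trailing '*' makes an entry a prefix match (see comm_prefix_match()). */
+    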
__uint(max_entries, MAX_COMM_FILTER); + __type(key, int); + __type(value, char[16]); +} filter_comms_map SEC(".maps"); + +/* -------------------------------------------------------------------------- */ +/* Utility functions */ +/* -------------------------------------------------------------------------- */ + +/* Update state statistics info */ +static struct state_info *__update_state_info(struct state_key *key, u64 delta) +{ + struct state_info *info = bpf_map_lookup_elem(&state_info, key); + if (info) { + info->calls++; + info->total += delta; + if (delta > info->max) + info->max = delta; + if (delta < info->min || !info->min) + info->min = delta; + return info; + } + struct state_info new_info = { + .calls = 1, + .total = delta, + .min = delta, + .max = delta, + }; + bpf_map_update_elem(&state_info, key, &new_info, BPF_ANY); + return bpf_map_lookup_elem(&state_info, key); +} + +/* Update state statistics for process or system */ +static struct state_info *update_state_info(pid_t pid, int state, u64 delta) +{ + struct state_key key = { .pid = pid, .state = state }; + return __update_state_info(&key, delta); +} + +/* Get process comm */ +static __always_inline int get_task_comm(char *comm, struct task_struct *task) +{ + __builtin_memset(comm, 0, TASK_COMM_LEN); + return bpf_core_read_str(comm, TASK_COMM_LEN, &task->comm); +} + +/* Prefix match for comm filter */ +static __always_inline bool comm_prefix_match(const char comm[TASK_COMM_LEN], + const char pattern[TASK_COMM_LEN]) +{ +#pragma unroll + for (int i = 0; i < TASK_COMM_LEN; i++) { + char pc = pattern[i]; + char cc = comm[i]; + + if (pc == '*') + return true; + if (pc == '\0') + return cc == '\0'; + if (cc == '\0') + return false; + if (pc != cc) + return false; + } + return true; +} + +/* Check if comm matches target */ +static __always_inline bool is_target_comm(const char comm[TASK_COMM_LEN]) +{ + bool *exists = bpf_map_lookup_elem(&target_comms_map, comm); + if (exists) + return true; + + for (int i = 0; i < MAX_COMM_FILTER; i++) { + if (i >= filter.comm_num) + break; + int key = i; + const char *pattern = bpf_map_lookup_elem(&filter_comms_map, &key); + if (!pattern) + continue; + if (comm_prefix_match(comm, pattern)) { + bool one = 1; + bpf_map_update_elem(&target_comms_map, comm, &one, BPF_ANY); + return true; + } + } + return false; +} + +/* Output stacktrace event to perf buffer */ +static __always_inline void update_info_to_perf(void *ctx, struct task_state_node *task_node, u64 latency, u64 total_delay) +{ + struct stacktrace_event event = {}; + event.pid = task_node->pid; + event.state = task_node->last_state; + event.latency = latency; + event.total_delay = total_delay; + + if (filter.stack) { + event.last_user_stack_id = task_node->last_user_stack_id; + event.last_kern_stack_id = task_node->last_kern_stack_id; + event.user_stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_USER_STACK); + event.kern_stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_FAST_STACK_CMP); + } + perf_output(ctx, &event, sizeof(event)); +} + +/* Update process state node */ +static __always_inline void update_task_state_node(struct task_state_node *node, int pid, int last_state, u64 time, struct task_struct *task, void *ctx) +{ + if (last_state != TASK_RUNNING) { + struct percpu_counter *rss = BPF_CORE_READ(task, mm, rss_stat); + node->curr_state_info.memused = rss[0].count + rss[1].count + rss[3].count; + node->curr_state_info.readchar = BPF_CORE_READ(task, ioac.rchar); + node->curr_state_info.writechar = BPF_CORE_READ(task, ioac.wchar); + 
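/* Delay-accounting counters (ns) read from task->delays (CONFIG_TASK_DELAY_ACCT); the interval printer diffs successive snapshots. */
+        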
node->curr_state_info.freepages_delay = BPF_CORE_READ(task, delays, freepages_delay); + node->curr_state_info.thrashing_delay = BPF_CORE_READ(task, delays, thrashing_delay); + node->curr_state_info.swapin_delay = BPF_CORE_READ(task, delays, swapin_delay); + } + node->last_time = time; + node->last_state = last_state; + node->priority = BPF_CORE_READ(task, normal_prio); + get_task_comm(node->comm, task); + if (filter.stack) { + node->last_user_stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_USER_STACK); + node->last_kern_stack_id = bpf_get_stackid(ctx, &stack_map, BPF_F_FAST_STACK_CMP); + } + bpf_map_update_elem(&task_state_node, &pid, node, BPF_ANY); +} + +/* Handle process context switch */ +SEC("tp_btf/sched_switch") +void BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next) +{ + u64 time = bpf_ktime_get_ns(); + pid_t prev_pid = BPF_CORE_READ(prev, pid); + pid_t next_pid = BPF_CORE_READ(next, pid); + int prev_tgid = BPF_CORE_READ(prev, tgid); + int next_tgid = BPF_CORE_READ(next, tgid); + int prev_state = BPF_CORE_READ(prev, __state); + int next_state = BPF_CORE_READ(next, __state); + char prev_comm[TASK_COMM_LEN], next_comm[TASK_COMM_LEN]; + struct task_state_node *prev_node, *next_node; + + /* Filter by pid or comm */ + if (filter.pid) { + if (bpf_map_lookup_elem(&target_tgids, &prev_tgid)) + goto record_prev_info; + else if (bpf_map_lookup_elem(&target_tgids, &next_tgid)) + goto record_next_info; + else + return; + } else if (filter.comm) { + get_task_comm(prev_comm, prev); + get_task_comm(next_comm, next); + if (is_target_comm(prev_comm)) + goto record_prev_info; + else if (is_target_comm(next_comm)) + goto record_next_info; + else + return; + } + +record_prev_info: + /* Update prev process running time and state */ + if (prev_pid > 0) { + prev_node = bpf_map_lookup_elem(&task_state_node, &prev_pid); + if (prev_node) { + if (filter.state < 0 && prev_node->last_state == TASK_RUNNING && + time > prev_node->last_time && prev_pid > 0 && prev_pid != next_pid) { + u64 delta = time - prev_node->last_time; + if (filter.pid || filter.comm || filter.perins) { + struct state_info *info = update_state_info(prev_pid, TASK_RUNNING, delta); + if (prev_node && filter.latency > 0 && info && + (info->total > filter.latency || delta > filter.latency)) + update_info_to_perf(ctx, prev_node, delta, info->total); + if (!prev_node->has_state_info) + prev_node->has_state_info = true; + } + if (!(filter.pid || filter.comm || filter.perins)) + update_state_info(-1, TASK_RUNNING, delta); + } + update_task_state_node(prev_node, prev_pid, prev_state, time, prev, ctx); + } else if (prev_pid > 0 && prev_pid != next_pid) { + struct task_state_node new_node = { + .pid = prev_pid, + .tgid = prev_tgid, + .has_state_info = false, + }; + update_task_state_node(&new_node, prev_pid, prev_state, time, prev, ctx); + } + if (filter.pid || filter.comm) + return; + } + +record_next_info: + /* Update next process RUNDELAY statistics */ + if (next_pid > 0) { + next_node = bpf_map_lookup_elem(&task_state_node, &next_pid); + if (next_node) { + if (filter.state < 0 && next_node->last_state == TASK_RUNNING && + time > next_node->last_time && next_pid > 0 && prev_pid != next_pid) { + u64 delta = time - next_node->last_time; + if (filter.pid || filter.comm || filter.perins) { + struct state_info *info = update_state_info(next_pid, RUNDELAY, delta); + if (next_node && filter.latency > 0 && info && + (info->total > filter.latency || delta > filter.latency)) + update_info_to_perf(ctx, next_node, 
delta, info->total); + if (!next_node->has_state_info) + next_node->has_state_info = true; + } + if (!(filter.pid || filter.comm)) + update_state_info(-1, RUNDELAY, delta); + } + update_task_state_node(next_node, next_pid, TASK_RUNNING, time, next, ctx); + } else if (next_pid > 0 && prev_pid != next_pid) { + struct task_state_node new_node = { + .pid = next_pid, + .tgid = next_tgid, + .has_state_info = false, + }; + update_task_state_node(&new_node, next_pid, TASK_RUNNING, time, next, ctx); + } + } +} + +/* Handle process wakeup (from wait queue to ready queue) */ +SEC("tp_btf/sched_wakeup") +void BPF_PROG(sched_wakeup, struct task_struct *task) +{ + u64 time = bpf_ktime_get_ns(); + pid_t pid = BPF_CORE_READ(task, pid); + int tgid = BPF_CORE_READ(task, tgid); + char comm[TASK_COMM_LEN]; + struct task_state_node *node = NULL; + + /* Filter by pid or comm */ + if (filter.pid) { + if (bpf_map_lookup_elem(&target_tgids, &tgid)) + goto record_info; + else + return; + } else if (filter.comm) { + get_task_comm(comm, task); + if (is_target_comm(comm)) + goto record_info; + else + return; + } + +record_info: + node = bpf_map_lookup_elem(&task_state_node, &pid); + if (pid <= 0) + return; + if (node) { + if (filter.state != TASK_NO_INTERRUPTIBLE && + (filter.state == -1 || (filter.state & node->last_state) == node->last_state) && + node->last_state && time > node->last_time && pid > 0) { + u64 delta = time - node->last_time; + if (filter.pid || filter.comm || filter.perins) { + struct state_info *info = update_state_info(pid, node->last_state, delta); + if (node && filter.latency > 0 && info && + (info->total > filter.latency || delta > filter.latency)) + update_info_to_perf(ctx, node, delta, info->total); + if (!node->has_state_info) + node->has_state_info = true; + } + if (!(filter.pid || filter.comm)) { + update_state_info(-1, node->last_state, delta); + } + } + update_task_state_node(node, pid, TASK_RUNNING, time, task, ctx); + } else { + struct task_state_node new_node = { + .pid = pid, + .tgid = tgid, + .last_time = time, + .has_state_info = false, + }; + update_task_state_node(&new_node, pid, TASK_RUNNING, time, task, ctx); + } +} + +/* Handle new process wakeup (enter ready queue) */ +SEC("tp_btf/sched_wakeup_new") +void BPF_PROG(sched_wakeup_new, struct task_struct *task) +{ + u64 time = bpf_ktime_get_ns(); + pid_t pid = BPF_CORE_READ(task, pid); + int tgid = BPF_CORE_READ(task, tgid); + char comm[TASK_COMM_LEN]; + struct task_state_node *node = NULL; + + /* Filter by pid or comm */ + if (filter.pid) { + if (bpf_map_lookup_elem(&target_tgids, &tgid)) + goto record_info; + else + return; + } else if (filter.comm) { + get_task_comm(comm, task); + if (is_target_comm(comm)) + goto record_info; + else + return; + } + +record_info: + if (pid <= 0) + return; + node = bpf_map_lookup_elem(&task_state_node, &pid); + if (node) { + if (filter.state != TASK_NO_INTERRUPTIBLE && + (filter.state == -1 || (filter.state & node->last_state) == node->last_state) && + node->last_state && time > node->last_time && pid > 0) { + u64 delta = time - node->last_time; + if (filter.pid || filter.comm || filter.perins) { + struct state_info *info = update_state_info(pid, node->last_state, delta); + if (node && filter.latency > 0 && info && + (info->total > filter.latency || delta > filter.latency)) + update_info_to_perf(ctx, node, delta, info->total); + if (!node->has_state_info) + node->has_state_info = true; + } + if (!(filter.pid || filter.comm)) { + update_state_info(-1, node->last_state, delta); + } + } + 
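/* A newly woken task enters the run queue as TASK_RUNNING. */
+        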
update_task_state_node(node, pid, TASK_RUNNING, time, task, ctx); + } else { + struct task_state_node new_node = { + .pid = pid, + .tgid = tgid, + .last_time = time, + .has_state_info = false, + }; + update_task_state_node(&new_node, pid, TASK_RUNNING, time, task, ctx); + } +} + +char LICENSE[] SEC("license") = "GPL"; \ No newline at end of file diff --git a/bpf-skel/task_state.h b/bpf-skel/task_state.h new file mode 100644 index 0000000..65265ef --- /dev/null +++ b/bpf-skel/task_state.h @@ -0,0 +1,113 @@ +#ifndef __TASK_STATE_H__ +#define __TASK_STATE_H__ + +#include +#include + +/* Task comm string length */ +#define TASK_COMM_LEN 16 + +/* Task state flags (used in tsk->state) */ +#define TASK_RUNNING 0x00000000 +#define TASK_INTERRUPTIBLE 0x00000001 +#define TASK_UNINTERRUPTIBLE 0x00000002 +#define __TASK_STOPPED 0x00000004 +#define __TASK_TRACED 0x00000008 + +/* Task exit state flags (used in tsk->exit_state) */ +#define EXIT_DEAD 0x00000010 +#define EXIT_ZOMBIE 0x00000020 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) + +/* Additional task state flags */ +#define TASK_PARKED 0x00000040 +#define TASK_DEAD 0x00000080 +#define TASK_WAKEKILL 0x00000100 +#define TASK_WAKING 0x00000200 +#define TASK_NOLOAD 0x00000400 +#define TASK_NEW 0x00000800 +#define TASK_RTLOCK_WAIT 0x00001000 +#define TASK_FREEZABLE 0x00002000 +#define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) +#define TASK_FROZEN 0x00008000 +#define TASK_STATE_MAX 0x00010000 +#define RUNDELAY (TASK_STATE_MAX << 1) +#define TASK_ANY (TASK_STATE_MAX - 1) + +/* Composite state macros */ +#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) +#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) +#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) +#define TASK_TRACED __TASK_TRACED +#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) +#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) +#define TASK_NO_INTERRUPTIBLE -2 + +/* Stack and filter limits */ +#define MAX_STACKS 32 // Maximum stack depth +#define MAX_COMM_FILTER 16 // Maximum number of comm filters + +struct filters { + bool pid; // Filter by pid + bool comm; // Filter by comm + int comm_num; // Number of comm filters + int state; // Task state filter + uint64_t latency; // Latency threshold + int stack; // Stack trace enabled + bool perins; // Per-process statistics enabled +}; + +/* Key for state statistics map */ +struct state_key { + pid_t pid; // -1 for system, pid for process + int state; // Task state +}; + +/* Value for state statistics map */ +struct state_info { + int calls; // Number of calls + unsigned long total; // Total time spent in this state (us) + unsigned long min; // Minimum time spent in this state (us) + unsigned long max; // Maximum time spent in this state (us) + unsigned long p50; // 50th percentile (us) + unsigned long p95; // 95th percentile (us) + unsigned long p99; // 99th percentile (us) +}; + +/* Last state info for a task */ +struct task_last_state { + int memused; // Memory used + uint64_t readchar; // Bytes read + uint64_t writechar; // Bytes written + uint64_t freepages_delay; // Free pages delay + uint64_t thrashing_delay; // Thrashing delay + uint64_t swapin_delay; // Swap-in delay +}; + +/* State node for a task */ +struct task_state_node { + pid_t pid; // Process ID + pid_t tgid; // Thread Group ID + char comm[TASK_COMM_LEN]; // Command name + unsigned long long last_time; // Timestamp (ns) + int last_state; // Last state + int last_user_stack_id; // Last user stack ID + int 
last_kern_stack_id; // Last kernel stack ID + bool has_state_info; // State info available + int priority; // Task priority + struct task_last_state curr_state_info; // Current state info +}; + +/* Stack trace event structure */ +struct stacktrace_event { + pid_t pid; // Process ID + uint64_t latency; // Latency (ns) + uint64_t total_delay; // Total delay (ns) + int state; // Task state + int last_user_stack_id; // Last user stack ID + int last_kern_stack_id; // Last kernel stack ID + int user_stack_id; // User stack ID + int kern_stack_id; // Kernel stack ID +}; + +#endif /* __TASK_STATE_H__ */ \ No newline at end of file diff --git a/bpf_task_state.c b/bpf_task_state.c new file mode 100644 index 0000000..2b2e020 --- /dev/null +++ b/bpf_task_state.c @@ -0,0 +1,616 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct taskstate_ctx { + struct bpf_task_state_bpf *obj; + struct perf_thread_map *thread_map; + struct perf_evsel *evsel; + struct ksyms *ksyms; + struct syms_cache *syms_cache; +}; + +// static struct ksyms *ksyms = NULL; + +/* Initialize monitor context */ +static int monitor_ctx_init(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx = zalloc(sizeof(*ctx)); + if (!ctx) + return -1; + dev->private = ctx; + + ctx->obj = bpf_task_state_bpf__open(); + if (!ctx->obj) { + printf("Failed to open BPF skeleton\n"); + goto free_ctx; + } + return 0; +free_ctx: + free(ctx); + return -1; +} + +/* Cleanup monitor context */ +static void monitor_ctx_exit(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx = dev->private; + bpf_task_state_bpf__detach(ctx->obj); + bpf_task_state_bpf__destroy(ctx->obj); + free(ctx); +} + +/* Get TGID for a given TID */ +static int get_tgid(int tid) +{ + char path[256], line[256]; + FILE *fp; + int pid = 0; + + snprintf(path, sizeof(path), "/proc/%d/status", tid); + fp = fopen(path, "r"); + if (!fp) + return -1; + + while (fgets(line, sizeof(line), fp)) { + if (sscanf(line, "Tgid: %d", &pid) == 1) + break; + pid = 0; + } + fclose(fp); + + return pid; +} + +/* Count number of process names in filter string */ +static int count_process_names(const char *filter_str) +{ + int count = 0; + char *filter = strdup(filter_str); + char *s = filter; + char *sep; + + while ((sep = strchr(s, ',')) != NULL) { + *sep = '\0'; + if (*s) + count++; + s = sep + 1; + } + if (*s) + count++; + free(filter); + return count; +} + +/* Pattern match for comm string */ +static int comm_pattern_match(const char *comm, const char *pattern) +{ + for (int i = 0; i < TASK_COMM_LEN; i++) { + char pc = pattern[i]; + char cc = comm[i]; + if (pc == '*') return 1; + if (pc == '\0') return cc == '\0'; + if (cc == '\0') return 0; + if (pc != cc) return 0; + } + return 1; +} + +/* Initialize filter_comms_map with filter string */ +static int init_filter_comms_map(struct bpf_map *filter_comms_map, const char *filter_str) +{ + char *filter = strdup(filter_str); + char *s; + char *sep; + char comm[16]; + int idx = 0; + if (!filter) return -1; + s = filter; + while ((sep = strchr(s, ',')) != NULL && idx < MAX_COMM_FILTER) { + *sep = '\0'; + if (*s) { + memset(comm, 0, sizeof(comm)); + strncpy(comm, s, 15); + bpf_map__update_elem(filter_comms_map, &idx, sizeof(idx), comm, sizeof(comm), 0); + idx++; + } + s = sep + 1; + } + if (*s && idx < MAX_COMM_FILTER) { + memset(comm, 0, sizeof(comm)); + strncpy(comm, s, 15); + bpf_map__update_elem(filter_comms_map, 
&idx, sizeof(idx), comm, sizeof(comm), 0); + idx++; + } + free(filter); + return 0; +} + +/* Initialize target_comms_map by scanning /proc and matching filter patterns */ +static int init_target_comms_map(struct bpf_map *filter_comms_map, int filter_cnt, struct bpf_map *target_comms_map) +{ + DIR *proc; + struct dirent *entry; + char comm_path[300], comm[16], pattern[16]; + bool dummy = true; + proc = opendir("/proc"); + if (!proc) return -1; + while ((entry = readdir(proc)) != NULL) { + FILE *f; + if (!isdigit(entry->d_name[0])) + continue; + snprintf(comm_path, sizeof(comm_path), "/proc/%s/comm", entry->d_name); + f = fopen(comm_path, "r"); + if (!f) continue; + if (!fgets(comm, sizeof(comm), f)) { + fclose(f); + continue; + } + fclose(f); + comm[strcspn(comm, "\n")] = 0; + for (int i = 0; i < filter_cnt; i++) { + memset(pattern, 0, sizeof(pattern)); + if (bpf_map__lookup_elem(filter_comms_map, &i, sizeof(i), pattern, sizeof(pattern), 0) == 0) { + if (comm_pattern_match(comm, pattern)) { + bpf_map__update_elem(target_comms_map, comm, sizeof(comm), &dummy, sizeof(dummy), 0); + break; + } + } + } + } + closedir(proc); + return 0; +} + +static int bpf_task_state_init(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx; + struct env *env = dev->env; + struct perf_evlist *evlist = dev->evlist; + struct bpf_map *filter_comms_map; + struct bpf_map *target_comms_map; + int comm_num; + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_BPF_OUTPUT, + .size = sizeof(struct perf_event_attr), + .sample_period = 1, + .sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW, + .read_format = 0, + .pinned = 1, + .disabled = 0, + .watermark = 1, + .wakeup_watermark = 1, + }; + + if (monitor_ctx_init(dev) < 0) + return -1; + + ctx = dev->private; + + prof_dev_env2attr(dev, &attr); + ctx->evsel = perf_evsel__new(&attr); + if (!ctx->evsel) + goto failed; + perf_evlist__add(evlist, ctx->evsel); + + ctx->obj->rodata->filter.pid = env->pids ? true : false; + ctx->obj->rodata->filter.comm = env->filter ? true : false; + ctx->obj->rodata->filter.perins = env->perins ? true : false; + ctx->obj->rodata->filter.latency = env->greater_than ? env->greater_than : ctx->obj->rodata->filter.latency; + ctx->obj->rodata->filter.stack = env->callchain ? 
true : false; + + if (env->interruptible && env->uninterruptible) { + ctx->obj->rodata->filter.state = TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE; + } else if (env->interruptible) { + ctx->obj->rodata->filter.state = TASK_INTERRUPTIBLE; + } else if (env->uninterruptible) { + ctx->obj->rodata->filter.state = TASK_UNINTERRUPTIBLE; + } else if (env->interruptible_set && !env->interruptible){ + ctx->obj->rodata->filter.state = TASK_NO_INTERRUPTIBLE; + } + + if (env->filter) { + comm_num = count_process_names(env->filter); + ctx->obj->rodata->filter.comm_num = comm_num; + } + + if (bpf_task_state_bpf__load(ctx->obj)){ + fprintf(stderr, "Failed to load and verify BPF skeleton\n"); + goto failed; + } + + filter_comms_map = ctx->obj->maps.filter_comms_map; + target_comms_map = ctx->obj->maps.target_comms_map; + if (env->filter) { + if (init_filter_comms_map(filter_comms_map, env->filter) < 0) { + fprintf(stderr, "Failed to initialize filter_comms_map\n"); + return -1; + } + if (init_target_comms_map(filter_comms_map, comm_num, target_comms_map) < 0) { + fprintf(stderr, "Failed to initialize target_comms_map\n"); + return -1; + } + } + return 0; + +failed: + monitor_ctx_exit(dev); + return -1; +} + +#define FD(e, x, y) ((int *) xyarray__entry(e->fd, x, y)) +static int bpf_task_state_filter(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx = dev->private; + struct bpf_map *target_tgids = ctx->obj->maps.target_tgids; + int idx, tid, err, cpu, ins, perf_event_fd; + + ctx->thread_map = dev->threads; + perf_cpu_map__put(dev->cpus); + dev->cpus = perf_cpu_map__new(NULL); + dev->threads = perf_thread_map__new_dummy(); + + if (ctx->thread_map) { + perf_thread_map__for_each_thread(tid, idx, ctx->thread_map) { + int tgid = get_tgid(tid); + if (tgid > 0) { + bpf_map__update_elem(target_tgids, &tgid, sizeof(tgid), &tgid, sizeof(tgid), 0); + } + } + } + + perf_cpu_map__for_each_cpu(cpu, ins, dev->cpus) { + tid = 0; idx = 0; + perf_thread_map__for_each_thread(tid, idx, ctx->thread_map) { + int *fd_ptr; + fd_ptr = FD(ctx->evsel, ins, idx); + if (!fd_ptr) { + continue; + } + perf_event_fd = *fd_ptr; + err = bpf_map__update_elem(ctx->obj->maps.perf_events, &cpu, sizeof(cpu), &perf_event_fd, sizeof(perf_event_fd), 0); + if (err < 0) { + printf("Failed to update perf_events map\n"); + return -1; + } + } + } + + ctx->ksyms = ksyms__load(); + if (!ctx->ksyms) { + printf("Failed to create ksyms map\n"); + return -1; + } + ctx->syms_cache = syms_cache__new(); + if (!ctx->syms_cache) { + printf("Failed to create syms_cache map\n"); + ksyms__free(ctx->ksyms); + return -1; + } + + err = bpf_task_state_bpf__attach(ctx->obj); + if (err) { + printf("Failed to attach BPF skeleton\n"); + } + return 1; +} + +/* Convert state value to string */ +static const char *state_to_str(int state) +{ + switch (state) { + case TASK_RUNNING: return "R"; + case TASK_INTERRUPTIBLE: return "S"; + case TASK_UNINTERRUPTIBLE: return "D"; + case __TASK_STOPPED: return "T"; + case TASK_IDLE: return "I"; + case RUNDELAY: return "RD"; + default: return "?"; + } +} + +/* Print state statistics for a process */ +static void print_state_info(struct bpf_map *map, pid_t pid) +{ + struct state_key key = { .pid = pid }; + struct state_info value = {0}; + static const int states[] = { + TASK_RUNNING, + TASK_INTERRUPTIBLE, + TASK_UNINTERRUPTIBLE, + __TASK_STOPPED, + TASK_IDLE, + RUNDELAY, + }; + static const char *state_strs[] = { "R", "S", "D", "T", "I", "RD" }; + bool has_data = false; + + printf("%3s %10s %12s %12s %12s %12s\n", + "St", "calls", 
"total(us)", "min(us)", "max(us)", "avg(us)"); + printf("%3s %10s %12s %12s %12s %12s\n", + "---", "----------", "------------", "------------", "------------", "------------"); + for (int i = 0; i < (int)(sizeof(states)/sizeof(states[0])); i++) { + key.state = states[i]; + if (bpf_map__lookup_elem(map, &key, sizeof(key), &value, sizeof(value), 0) == 0 && value.calls > 0) { + has_data = true; + printf("%3s %10u %12.3f %12.3f %12.3f %12.3f\n", + state_strs[i], + value.calls, + value.total / 1000.0, + value.min / 1000.0, + value.max / 1000.0, + value.calls ? (double)value.total / value.calls / 1000.0 : 0.0 + ); + bpf_map__delete_elem(map, &key, sizeof(key), 0); + } + } + if (!has_data) + printf("%3s %10s %12s %12s %12s %12s\n", + "NULL", "NULL", "NULL", "NULL", "NULL", "NULL"); + printf("\n"); +} + +/* Print kernel symbol for a given address */ +static void print_ksym(__u64 addr, struct taskstate_ctx *ctx) +{ + const struct ksym *sym; + if (!addr) + return; + sym = ksyms__map_addr(ctx->ksyms, addr); + if (!sym) { + printf(" 0x%llx Unknown", addr); + } else { + printf(" 0x%llx %s+0x%llx", addr, sym->name, addr - sym->addr); + } + printf(" ([kernel.kallsysms])\n"); +} + +/* Print user symbol for a given address */ +static int print_usym(__u64 addr, pid_t pid, struct taskstate_ctx *ctx) +{ + struct syms *syms; + struct dso *dso = NULL; + const struct sym *sym = NULL; + uint64_t offset = 0; + const char *symbol = "Unknown"; + int unknown = 0; + + // 获取用户符号缓存 + syms = syms_cache__get_syms(ctx->syms_cache, pid); + if (syms) { + dso = syms__find_dso(syms, addr, &offset); + if (dso) { + sym = dso__find_sym(dso, offset); + if (sym) { + symbol = sym__name(sym); + offset = offset - sym->start; + unknown = dso__name(dso) ? 0 : 1; + } + } else + unknown = 1; + } + printf(" 0x%llx %s+0x%lx (%s)\n", addr, symbol, offset, unknown? 
"Unknown" : dso__name(dso)); + return unknown; +} + +/* Print stack trace for a given stack id */ +static void print_stack(int fd, int key, bool kernel_user_stack, pid_t pid, struct taskstate_ctx *ctx) +{ + __u64 ip[PERF_MAX_STACK_DEPTH] = {}; + int i, unknown = 0; + if (bpf_map_lookup_elem(fd, &key, &ip) != 0) { + printf("not find;\n"); + } else { + if(!kernel_user_stack) + for (i = 0; i < PERF_MAX_STACK_DEPTH; i++) + print_ksym(ip[i], ctx); + else + for (i = 0; i < PERF_MAX_STACK_DEPTH; i++){ + if(unknown < 2) + unknown += print_usym(ip[i], pid, ctx); + } + } +} + +/* Print all stack traces for a stacktrace_event */ +static void print_stacks(int fd, struct stacktrace_event *info, pid_t pid, struct taskstate_ctx *ctx) +{ + if (info->last_kern_stack_id) { + printf("Last kernel stack trace:\n"); + print_stack(fd, info->last_kern_stack_id, 0, pid, ctx); + } + if (info->last_user_stack_id) { + printf("Last user stack trace:\n"); + print_stack(fd, info->last_user_stack_id, 1, pid, ctx); + } + if (info->kern_stack_id) { + printf("Kernel stack trace:\n"); + print_stack(fd, info->kern_stack_id, 0, pid, ctx); + } + if (info->user_stack_id) { + printf("User stack trace:\n"); + print_stack(fd, info->user_stack_id, 1, pid, ctx); + } +} + +/* Print statistics for each process at interval */ +static void bpf_task_state_interval(struct prof_dev *dev) +{ + struct taskstate_ctx *ctx = dev->private; + if (dev->env->pids || dev->env->filter) { + pid_t key = -1, next_key = -1; + struct task_state_node task_node = {0}; + int task_state_node_fd = bpf_map__fd(ctx->obj->maps.task_state_node); + int last_task_node_fd = bpf_map__fd(ctx->obj->maps.last_task_node); + bool has_print_title = false; + if (bpf_map_get_next_key(task_state_node_fd, &key, &next_key)) { + return; + } + while (!bpf_map_get_next_key(task_state_node_fd, &key, &next_key)) { + if (bpf_map_lookup_elem(task_state_node_fd, &next_key, &task_node) < 0) { + printf("Failed to lookup task state node for PID in interval%d\n", next_key); + } else { + if (task_node.has_state_info) { + struct task_last_state last_task_node = {0}; + unsigned long memtotal = sysconf(_SC_PHYS_PAGES); + if (!has_print_title) { + printf("\n======================== Per-Process State Statistics ============================\n"); + print_time(stdout); + printf("\n"); + has_print_title = true; + } + printf("TGID: %d, PID: %d, COMM: %s, PRIO: %d\n", + task_node.tgid, task_node.pid, task_node.comm, task_node.priority); + bpf_map_lookup_elem(last_task_node_fd, &next_key, &last_task_node); + printf("MEM: %.3f, Read: %.3fkb, Write: %.3fkb\n", + (task_node.curr_state_info.memused * 100.0 / memtotal), + 1.0 * (task_node.curr_state_info.readchar - last_task_node.readchar) / 1024, + 1.0 * (task_node.curr_state_info.writechar - last_task_node.writechar) / 1024); + if(task_node.curr_state_info.freepages_delay - last_task_node.freepages_delay) + printf("FreePages: %ld ", + task_node.curr_state_info.freepages_delay - last_task_node.freepages_delay); + if(task_node.curr_state_info.thrashing_delay - last_task_node.thrashing_delay) + printf("Thrashing: %ld ", + task_node.curr_state_info.thrashing_delay - last_task_node.thrashing_delay); + if(task_node.curr_state_info.swapin_delay - last_task_node.swapin_delay) + printf("SwapIn: %ld ", + task_node.curr_state_info.swapin_delay - last_task_node.swapin_delay); + if (task_node.curr_state_info.freepages_delay - last_task_node.freepages_delay || + task_node.curr_state_info.thrashing_delay - last_task_node.thrashing_delay || + 
task_node.curr_state_info.swapin_delay - last_task_node.swapin_delay)
+                        printf("\n");
+                    print_state_info(ctx->obj->maps.state_info, task_node.pid);
+
+                    last_task_node.memused = task_node.curr_state_info.memused;
+                    last_task_node.readchar = task_node.curr_state_info.readchar;
+                    last_task_node.writechar = task_node.curr_state_info.writechar;
+                    last_task_node.freepages_delay = task_node.curr_state_info.freepages_delay;
+                    last_task_node.thrashing_delay = task_node.curr_state_info.thrashing_delay;
+                    last_task_node.swapin_delay = task_node.curr_state_info.swapin_delay;
+                    bpf_map_update_elem(last_task_node_fd, &next_key, &last_task_node, BPF_ANY);
+
+                    if ((task_node.last_state & TASK_DEAD) ||
+                        (task_node.last_state & EXIT_DEAD)) {
+                        printf("[DeadInfo]: %d dead at %u\n",
+                               task_node.pid, (unsigned int)task_node.last_time);
+                        bpf_map_delete_elem(task_state_node_fd, &next_key);
+                    } else if (task_node.last_state & EXIT_ZOMBIE) {
+                        printf("[ZombieInfo]: %d Zombie at %u not dead\n",
+                               task_node.pid, (unsigned int)task_node.last_time);
+                        task_node.has_state_info = false;
+                        bpf_map_update_elem(task_state_node_fd, &next_key, &task_node, BPF_ANY);
+                    } else {
+                        task_node.has_state_info = false;
+                        bpf_map_update_elem(task_state_node_fd, &next_key, &task_node, BPF_ANY);
+                    }
+                }
+            }
+            key = next_key;
+        }
+    } else {
+        printf("========================= SYS-State Statistics ====================\n");
+        print_time(stdout);
+        printf("\n");
+        print_state_info(ctx->obj->maps.state_info, -1);
+        printf("\n");
+    }
+}
+
+/* Print statistics for a process when sample event is triggered */
+static void bpf_task_state_sample(struct prof_dev *dev, union perf_event *event, int instance)
+{
+    struct taskstate_ctx *ctx = dev->private;
+    struct stacktrace_event *info = (void *)event->sample.array + sizeof(u64) + sizeof(u32);
+    struct task_state_node task_node = {0};
+    int task_state_fd = bpf_map__fd(ctx->obj->maps.task_state_node);
+    int stack_map_fd = bpf_map__fd(ctx->obj->maps.stack_map);
+    bool has_print_title = false;
+    if (dev->env->greater_than) {
+        if (bpf_map_lookup_elem(task_state_fd, &info->pid, &task_node) < 0) {
+            printf("Failed to lookup task state node for PID %d in sample\n", info->pid);
+        } else {
+            if (task_node.has_state_info) {
+                if (!has_print_title) {
+                    printf("\n======================== Per-Process State Statistics ============================\n");
+                    print_time(stdout);
+                    printf("\n");
+                    has_print_title = true;
+                }
+                printf("TGID: %d, PID: %d, COMM: %s, STATE: %s, PRIO: %d, latency: %luus, total_delay: %lums\n",
+                       task_node.tgid, task_node.pid, task_node.comm, state_to_str(info->state), task_node.priority,
+                       info->latency / NSEC_PER_USEC, info->total_delay / NSEC_PER_MSEC);
+                print_state_info(ctx->obj->maps.state_info, task_node.pid);
+                if (dev->env->callchain)
+                    print_stacks(stack_map_fd, info, task_node.pid, ctx);
+
+                task_node.has_state_info = false;
+                bpf_map_update_elem(task_state_fd, &info->pid, &task_node, BPF_ANY);
+            }
+        }
+    }
+}
+
+/* Cleanup and print interval statistics */
+static void bpf_task_state_deinit(struct prof_dev *dev)
+{
+    bpf_task_state_interval(dev);
+    monitor_ctx_exit(dev);
+}
+
+static const char *bpf_task_state_desc[] = PROFILER_DESC("bpf:task-state",
+    "[OPTION...] 
[-S] [-D] [--filter comm] [--perins] [--than ns] [-g [--flame-graph file]]",
+    "Show task state statistics via BPF.",
+    "",
+    "BPF-TASK-STATE",
+    "    int pid          # process id",
+    "    int state        # task state",
+    "    u64 last_time    # last state change time",
+    "",
+    "EXAMPLES",
+    "    "PROGRAME" bpf:task-state -i 1000",
+    "    "PROGRAME" bpf:task-state -p 1234 -i 1000",
+    "    "PROGRAME" bpf:task-state --filter 'java,python*' -i 1000",
+    "    "PROGRAME" bpf:task-state -i 1000 --no-interruptible",
+    "    "PROGRAME" bpf:task-state -p 1234 -SD --than 20ms -g",
+    "    "PROGRAME" bpf:task-state --perins --than 1ms -i 1000"
+);
+
+static const char *bpf_task_state_argv[] = PROFILER_ARGV("bpf:task-state",
+    PROFILER_ARGV_OPTION, "inherit",
+    PROFILER_ARGV_PROFILER, "interruptible", "uninterruptible",
+    "filter\nFilter process comm",
+    "perins", "than", "call-graph", "bpf-python-callchain"
+);
+
+struct monitor bpf_task_state = {
+    .name = "bpf:task-state",
+    .desc = bpf_task_state_desc,
+    .argv = bpf_task_state_argv,
+    .pages = 8,
+    .init = bpf_task_state_init,
+    .filter = bpf_task_state_filter,
+    .deinit = bpf_task_state_deinit,
+    .interval = bpf_task_state_interval,
+    .sample = bpf_task_state_sample,
+};
+
+MONITOR_REGISTER(bpf_task_state)
\ No newline at end of file
diff --git a/tests/test_bpf-task-state.py b/tests/test_bpf-task-state.py
new file mode 100644
index 0000000..1b11b24
--- /dev/null
+++ b/tests/test_bpf-task-state.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+
+from PerfProf import PerfProf
+from conftest import result_check
+
+def test_task_state_mode0(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000
+    task_state = PerfProf(['bpf:task-state', '-i', '1000'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode1(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --filter 'python,sh,bash,ls,ps,awk,grep'
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--filter', 'python,sh,bash,ls,ps,awk,grep'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode2(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 -SD
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '-SD'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode3(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 -SD --filter 'python,sh,bash,ls,ps,awk,grep'
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '-SD', '--filter', 'python,sh,bash,ls,ps,awk,grep'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode0_NOS(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --no-interruptible
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--no-interruptible'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode1_NOS(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --filter 'python,sh,bash,ls,ps,awk,grep' --no-interruptible
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--no-interruptible'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
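
+
+# A hedged extra example (assumes the usual PerfProf/conftest helpers): combine a
+# pid filter with --perins to exercise the per-instance accounting path.
+def test_task_state_pid_perins(runtime, memleak_check):
+    # perf-prof bpf:task-state -p 1 -i 1000 --perins
+    task_state = PerfProf(['bpf:task-state', '-p', '1', '-i', '1000', '--perins'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)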

+
+def test_task_state_mode1_than(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --filter 'python,sh,bash,ls,ps,awk,grep' --than 1s
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--than', '1s'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode1_than_perins(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 --filter 'python,sh,bash,ls,ps,awk,grep' --than 1s --perins
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--than', '1s', '--perins'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode3_than(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 -SD --filter 'python,sh,bash,ls,ps,awk,grep' --than 1s
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '-SD', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--than', '1s'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_mode3_than_perins(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 -m 256 -SD --filter 'python,sh,bash,ls,ps,awk,grep' --than 1s --perins
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '-m', '256', '-SD', '--filter', 'python,sh,bash,ls,ps,awk,grep', '--than', '1s', '--perins'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_filter(runtime, memleak_check):
+    # perf-prof bpf:task-state --filter 'java,python*' -S --than 100ms -g
+    task_state = PerfProf(['bpf:task-state', '--filter', 'java,python*', '-S', '--than', '100ms', '-g'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_pthread1(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 --perins -m 64 -- ./pthread --loop 10 --depth 3
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '--perins', '-m', '64', '--', './pthread', '--loop', '10', '--depth', '3'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_pthread2(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 --perins -m 64 -- ./pthread --depth 2
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '--perins', '-m', '64', '--', './pthread', '--depth', '2'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
+
+def test_task_state_pthread3(runtime, memleak_check):
+    # perf-prof bpf:task-state -i 1000 --perins -m 64 -- ./pthread --daemonize --depth 5
+    task_state = PerfProf(['bpf:task-state', '-i', '1000', '--perins', '-m', '64', '--', './pthread', '--daemonize', '--depth', '5'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)

From 2b6464686b35c5963172551dff8883fa2154a4bb Mon Sep 17 00:00:00 2001
From: AlbertXu
Date: Thu, 14 Aug 2025 10:43:30 +0800
Subject: [PATCH 2/4] bpf-pystack: use eBPF to maintain the call stacks of
 Python threads and output them together with kernel and user stacks at the
 appropriate times.

Compared to pystack, bpf-pystack supports stack maintenance for newly created processes.
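

The tool attaches to the python:function__entry and python:function__return
USDT probes of libpython. On entry it formats "filename funcname" into a
buffer, interns the string in the str_id_to_str hash map keyed by its FNV-1a
hash, and pushes {hash, lineno, timestamp} onto the per-thread thread_stack
map; on return it pops the frame, verifies the hash, and keeps the
longest-running frame in slot 0 of the stack.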


Usage examples:
- --bpf-python-callchain (e.g. --bpf-python-callchain /usr/local/lib/libpython3.6m.so.1.0)
---
 Build                                |   2 +
 bpf-skel/Build                       |   1 +
 bpf-skel/bpf_pystack.bpf.c           | 209 +++++++++++++++++++++++++++
 bpf-skel/bpf_pystack.h               |  18 +++
 bpf_pystack.c                        | 158 ++++++++++++++++++++
 monitor.c                            |   4 +
 monitor.h                            |   9 +-
 pystack.c                            | 166 ++++++++++++++-------
 task-state.c                         |   2 +-
 tests/test_--bpf-python-callchain.py |  10 ++
 10 files changed, 527 insertions(+), 52 deletions(-)
 create mode 100644 bpf-skel/bpf_pystack.bpf.c
 create mode 100644 bpf-skel/bpf_pystack.h
 create mode 100644 bpf_pystack.c
 create mode 100644 tests/test_--bpf-python-callchain.py

diff --git a/Build b/Build
index 93d72c6..35a84fb 100644
--- a/Build
+++ b/Build
@@ -38,6 +38,8 @@ perf-prof-y += tlbstat.o
 perf-prof-y += list.o
 perf-prof-y += kcore.o
 perf-prof-$(CONFIG_LIBBPF) += bpf_kvm_exit.o
+perf-prof-$(CONFIG_LIBBPF) += bpf_task_state.o
+perf-prof-$(CONFIG_LIBBPF) += bpf_pystack.o
 
 bin-y += perf-prof
 
diff --git a/bpf-skel/Build b/bpf-skel/Build
index 744262d..8d28243 100644
--- a/bpf-skel/Build
+++ b/bpf-skel/Build
@@ -1,2 +1,3 @@
 perf-prof-$(CONFIG_LIBBPF) += kvm_exit.skel.h
 perf-prof-$(CONFIG_LIBBPF) += bpf_task_state.skel.h
+perf-prof-$(CONFIG_LIBBPF) += bpf_pystack.skel.h
\ No newline at end of file
diff --git a/bpf-skel/bpf_pystack.bpf.c b/bpf-skel/bpf_pystack.bpf.c
new file mode 100644
index 0000000..53a3195
--- /dev/null
+++ b/bpf-skel/bpf_pystack.bpf.c
@@ -0,0 +1,209 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>   // BPF helper functions
+#include <bpf/bpf_tracing.h>
+#include "bpf_pystack.h"
+#include <bpf/usdt.bpf.h>      // BPF_USDT
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+// Format string kept in the read-only data section
+const char fmt[] SEC(".rodata") = "%s %s";
+// All-zero template used when inserting into thread_stack
+const struct stack_t zero_stack = {.depth = 1};
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(key_size, sizeof(u64));
+    __uint(value_size, MAX_STR_LEN);
+    __uint(max_entries, 1024);
+} str_id_to_str SEC(".maps");
+
+// Per-thread call-stack map: key=tid, value=stack_t
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(key_size, sizeof(u32));
+    __uint(value_size, sizeof(struct stack_t));
+    __uint(max_entries, 10240);
+} thread_stack SEC(".maps");
+
+/* Hash a string (FNV-1a) */
+static __always_inline u64 fnv1a_hash(const char *str, int len) {
+    u64 hash = 14695981039346656037ULL;
+    #pragma unroll
+    for (int i = 0; i < MAX_STR_LEN; i++) {
+        char c = str[i];
+        if (c == 0)
+            break;
+        hash ^= c;
+        hash *= 1099511628211ULL;
+    }
+    return hash;
+}
+
+/* String comparison */
+static __always_inline int my_bpf_strncmp(const char *s1, char *s2, int n)
+{
+    #pragma unroll
+    for (int i = 0; i < n; i++) {
+        char c1 = s1[i];
+        char c2 = s2[i];
+        if (c1 != c2)
+            return 1;
+        if (c1 == '\0')
+            return 0;
+    }
+    return 0;
+}
+
+static __always_inline int fill_func_buf(char *buf, size_t buf_sz, long filename_ptr, long funcname_ptr) {
+    __u64 data[2];
+    data[0] = filename_ptr;
+    data[1] = funcname_ptr;
+    return bpf_snprintf(buf, buf_sz, fmt, data, sizeof(data));
+}
+
+/* Shared: hash the string and look it up in / insert it into the map; returns the map value pointer or NULL */
+static __always_inline char *get_or_insert_funcstr(u64 *hash_out, char *buf, int len) {
+    u64 hash = fnv1a_hash(buf, len);
+    char *filefunc = bpf_map_lookup_elem(&str_id_to_str, &hash);
+    if (!filefunc) {
+        if (len >= MAX_STR_LEN)
+            return NULL;
+        bpf_map_update_elem(&str_id_to_str, &hash, buf, BPF_ANY);
+        *hash_out = hash;
+        return buf;
+    } else {
+        if (my_bpf_strncmp(filefunc, buf, MAX_STR_LEN) != 0) {
+            bpf_printk("hash err String mismatch: %s != %s\n", filefunc, buf);
+            return NULL;
+        }
+        *hash_out = hash;
+        return filefunc;
+    }
+}
+
+/* Shared: get the per-thread stack pointer, initializing it on first use */
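
+/* Note: zero_stack.depth == 1 reserves pystack[0], which stack_pop_and_check() uses to record the longest-observed frame delay. */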

+static __always_inline struct stack_t *get_or_init_stack(u32 tid) {
+    struct stack_t *stack = bpf_map_lookup_elem(&thread_stack, &tid);
+    if (!stack) {
+        bpf_map_update_elem(&thread_stack, &tid, &zero_stack, BPF_ANY);
+        stack = bpf_map_lookup_elem(&thread_stack, &tid);
+        if (!stack) {
+            bpf_printk("Failed to get or init stack for tid %d\n", tid);
+            return NULL;
+        }
+    }
+    return stack;
+}
+
+/* Shared: push onto the call stack */
+static __always_inline int stack_push(struct stack_t *stack, u64 hash, int lineno, u64 now) {
+    if (stack->depth < MAX_STACK_DEPTH) {
+        stack->pystack[stack->depth].filefunc = hash;
+        stack->pystack[stack->depth].lineno = lineno;
+        stack->pystack[stack->depth].time = now;
+        bpf_printk("[entry] push function_entry: %llu %d depth %d\n",
+                   stack->pystack[stack->depth].filefunc,
+                   stack->pystack[stack->depth].lineno,
+                   stack->depth);
+        stack->depth++;
+        return 0;
+    }
+    return -1;
+}
+
+/* Shared: pop and verify the hash */
+static __always_inline int stack_pop_and_check(struct stack_t *stack, u64 hash, u64 time) {
+    if (stack->depth > 1) {
+        stack->depth--;
+        int cur_depth = stack->depth;
+        if (cur_depth < 1 || cur_depth >= MAX_STACK_DEPTH) {
+            stack->depth++; // restore depth
+            bpf_printk("[return] stack underflow or overflow for tid\n");
+            return -1;
+        }
+        // Use cur_depth for all array accesses
+        if (cur_depth > 0 && cur_depth < MAX_STACK_DEPTH) {
+            if (stack->pystack[cur_depth].filefunc != hash) {
+                stack->depth++; // restore depth
+                bpf_printk("[return] hash %llu != %llu filefunc\n", hash, stack->pystack[cur_depth].filefunc);
+                return -2;
+            }
+            u64 delay = time - stack->pystack[cur_depth].time;
+            if (delay > stack->pystack[0].time) {
+                stack->pystack[0].filefunc = stack->pystack[cur_depth].filefunc;
+                stack->pystack[0].lineno = stack->pystack[cur_depth].lineno;
+                stack->pystack[0].time = delay;
+                bpf_printk("[return] %lu - %lu = %lu\n", time, stack->pystack[cur_depth].time, delay);
+            }
+            bpf_printk("[return] pop success function_return: %llu %d delay %llu depth %d\n",
+                       stack->pystack[cur_depth].filefunc,
+                       stack->pystack[cur_depth].lineno,
+                       delay,
+                       cur_depth);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+SEC("usdt/python:function__entry")
+int BPF_USDT(probe_function_entry, long filename_ptr, long funcname_ptr, long lineno) {
+    char buf[128];
+    long ret;
+    u64 hash, now;
+    char *filefunc;
+    now = bpf_ktime_get_ns();
+
+    /* 1. Concatenate the filename and funcname strings into buf */
+    ret = fill_func_buf(buf, sizeof(buf), filename_ptr, funcname_ptr);
+    if (ret <= 0) return 0;
+
+    //bpf_printk("[entry]:%s %d\n", buf, lineno);
+
+    /* 2. Hash the buf string and store it in str_id_to_str */
+    filefunc = get_or_insert_funcstr(&hash, buf, ret);
+    if (!filefunc) return 0;
+
+    /* 3. Get the current thread's thread_stack */
+    u32 tid = bpf_get_current_pid_tgid();
+    struct stack_t *stack = get_or_init_stack(tid);
+    if (!stack) return 0;
+
+    /* 4. Push the id onto the call stack */
+    if (stack_push(stack, hash, lineno, now) < 0)
+        bpf_printk("[entry] stack overflow for tid %d\n", tid);
+
+    return 0;
+}
+
+SEC("usdt/python:function__return")
+int BPF_USDT(probe_function_return, long filename_ptr, long funcname_ptr, long lineno) {
+    char buf[128];
+    long ret;
+    u64 hash, now;
+    char *filefunc;
+    now = bpf_ktime_get_ns();
+
+    /* 1. Concatenate the filename and funcname strings into buf */
+    ret = fill_func_buf(buf, sizeof(buf), filename_ptr, funcname_ptr);
+    if (ret <= 0) return 0;
+
+    //bpf_printk("[return]:%s %d\n", buf, lineno);
+
+    /* 2. Hash the buf string and store it in str_id_to_str */
+    filefunc = get_or_insert_funcstr(&hash, buf, ret);
+    if (!filefunc) return 0;
+
+    /* 3. Get the current thread's thread_stack */
+    u32 tid = bpf_get_current_pid_tgid();
+    struct stack_t *stack = get_or_init_stack(tid);
+    if (!stack) return 0;
+
+    /* 4. Pop the id from the call stack */
+    stack_pop_and_check(stack, hash, now);
+
+    return 0;
+}
+
diff --git a/bpf-skel/bpf_pystack.h b/bpf-skel/bpf_pystack.h
new file mode 100644
index 0000000..f679860
--- /dev/null
+++ b/bpf-skel/bpf_pystack.h
@@ -0,0 +1,18 @@
+#include
+#include
+
+// Maximum call-stack depth
+#define MAX_STACK_DEPTH 32
+// Maximum string length
+#define MAX_STR_LEN 128
+
+struct perstack {
+    u64 filefunc;
+    u64 time;
+    int lineno;
+};
+// Per-thread call-stack structure
+struct stack_t {
+    u32 depth;
+    struct perstack pystack[MAX_STACK_DEPTH];
+};
\ No newline at end of file
diff --git a/bpf_pystack.c b/bpf_pystack.c
new file mode 100644
index 0000000..a47c244
--- /dev/null
+++ b/bpf_pystack.c
@@ -0,0 +1,158 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "trace_helpers.h"
+
+struct bpf_pystack_ctx {
+    struct bpf_pystack_bpf *obj;
+    void *fixed_event;
+    struct perf_evsel *evsel;
+    //char binary_path[BINARY_PATH_SIZE];
+};
+static const char *object;
+static int monitor_ctx_init(struct prof_dev *dev)
+{
+    struct bpf_pystack_ctx *ctx = zalloc(sizeof(*ctx));
+    if (!ctx)
+        return -1;
+    dev->private = ctx;
+
+    ctx->obj = bpf_pystack_bpf__open();
+    if (!ctx->obj) {
+        fprintf(stderr, "Failed to open BPF skeleton\n");
+        goto free_ctx;
+    }
+    return 0;
+free_ctx:
+    free(ctx);
+    return -1;
+}
+
+static void monitor_ctx_exit(struct prof_dev *dev)
+{
+    struct bpf_pystack_ctx *ctx = dev->private;
+    bpf_pystack_bpf__destroy(ctx->obj);
+    free(ctx->fixed_event);
+    free(ctx);
+}
+
+static int bpf_pystack_init(struct prof_dev *dev)
+{
+    struct bpf_pystack_ctx *ctx;
+    struct env *env = dev->env;
+    struct perf_evlist *evlist = dev->evlist;
+    struct perf_event_attr attr = {
+        .type = PERF_TYPE_SOFTWARE,
+        .config = PERF_COUNT_SW_BPF_OUTPUT,
+        .size = sizeof(struct perf_event_attr),
+        .sample_type = PERF_SAMPLE_TIME,
+        .sample_period = 1,
+        .pinned = 1,
+        .disabled = 1,
+        .watermark = 1,
+    };
+    if (monitor_ctx_init(dev) < 0)
+        return -1;
+    ctx = dev->private;
+    prof_dev_env2attr(dev, &attr);
+
+    ctx->evsel = perf_evsel__new(&attr);
+    if (!ctx->evsel)
+        goto failed;
+
+    perf_evlist__add(evlist, ctx->evsel);
+    if (bpf_pystack_bpf__load(ctx->obj))
+        goto failed;
+
+    ctx->fixed_event = malloc(PERF_SAMPLE_MAX_SIZE);
+    if (!ctx->fixed_event) {
+        printf("Failed to allocate fixed_event buffer\n");
+        goto failed;
+    }
+    object = env->bpf_python_callchain;
+    printf("bpf_pystack object: %s\n", object);
+    return 0;
+
+failed:
+    monitor_ctx_exit(dev);
+    return -1;
+}
+
+static int bpf_pystack_filter(struct prof_dev *dev)
+{
+    struct bpf_pystack_ctx *ctx = dev->private;
+    int err;
+
+    ctx->obj->links.probe_function_entry = bpf_program__attach_usdt(ctx->obj->progs.probe_function_entry, -1,
+                                                object, "python", "function__entry", NULL);
+    if (!ctx->obj->links.probe_function_entry) {
+        err = errno;
+        fprintf(stderr, "attach usdt probe_function_entry failed: %s\n", strerror(err));
+    }
+
+    ctx->obj->links.probe_function_return = bpf_program__attach_usdt(ctx->obj->progs.probe_function_return, -1,
+                                                object, "python", "function__return", NULL);
+    if (!ctx->obj->links.probe_function_return) {
+        err = errno;
+        fprintf(stderr, "attach usdt probe_function_return failed: %s\n", strerror(err));
+    }
+    return 1;
+}
+
+struct monitor bpf_pystack = {
+    .name = "bpf-pystack",
+    .pages = 4,
+    .init = 
bpf_pystack_init, + .filter = bpf_pystack_filter, + .deinit = monitor_ctx_exit, +}; +MONITOR_REGISTER(bpf_pystack) + +int bpf_pystack_link(struct prof_dev *main_dev) +{ + struct prof_dev *bpf_pydev; + struct env *e; + if (main_dev->prof == &bpf_pystack) + return 0; + e = clone_env(main_dev->env); + if (!e) + return -1; + + // Specifies the parent of pydev so that the real main_dev can be found + // when heap-sorting pydev events. See order_main_dev(). + bpf_pydev = prof_dev_open_cpu_thread_map(&bpf_pystack, e, NULL, NULL, main_dev); + if (!bpf_pydev){ + printf("fail to open bpf_pystack device\n"); + return -1; + } + + if (order_together(main_dev, bpf_pydev) < 0) { + prof_dev_close(bpf_pydev); + return -1; + } + main_dev->links.pystack = bpf_pydev; + return 0; +} + +void bpf_pystack_unlink(struct prof_dev *main_dev) +{ + if (main_dev->links.pystack) { + prof_dev_close(main_dev->links.pystack); + main_dev->links.pystack = NULL; + } +} diff --git a/monitor.c b/monitor.c index a2dcf4c..af32636 100644 --- a/monitor.c +++ b/monitor.c @@ -469,6 +469,7 @@ struct option main_options[] = { OPT_BOOLEAN_SET ( 0 , "user-callchain", &env.user_callchain, &env.user_callchain_set, "include user callchains, no- prefix to exclude"), OPT_BOOLEAN_SET ( 0 , "kernel-callchain", &env.kernel_callchain, &env.kernel_callchain_set, "include kernel callchains, no- prefix to exclude"), OPT_BOOL_NONEG ( 0 , "python-callchain", &env.python_callchain, "include python callchains"), + OPT_STRDUP_NONEG(0, "bpf-python-callchain", &env.bpf_python_callchain, "path of your libpython (/usr/lib64/libpython3.6m.so.1.0)", "include python callchains by bpf, extra string"), OPT_INT_OPTARG_SET( 0 , "irqs_disabled", &env.irqs_disabled, &env.irqs_disabled_set, 1, "0|1", "ebpf, irqs disabled or not."), OPT_INT_OPTARG_SET( 0 , "tif_need_resched", &env.tif_need_resched, &env.tif_need_resched_set, 1, "0|1", "ebpf, TIF_NEED_RESCHED is set or not."), OPT_INT_NONEG_SET ( 0 , "exclude_pid", &env.exclude_pid, &env.exclude_pid_set, "pid", "ebpf, exclude pid"), @@ -2196,6 +2197,9 @@ struct prof_dev *prof_dev_open_internal(profiler *prof, struct env *env, if (env->python_callchain) if (pystack_link(dev) < 0) goto out_order_deinit; + if (env->bpf_python_callchain) + if (bpf_pystack_link(dev) < 0) + goto out_order_deinit; } if (dev->env->interval) { diff --git a/monitor.h b/monitor.h index e2c206d..ce4c263 100644 --- a/monitor.h +++ b/monitor.h @@ -139,6 +139,8 @@ struct env { bool user_callchain, user_callchain_set; bool kernel_callchain, kernel_callchain_set; bool python_callchain; + char *bpf_python_callchain; + //bool bpf_python_callchain; // ebpf bool irqs_disabled_set, tif_need_resched_set, exclude_pid_set; bool nr_running_min_set, nr_running_max_set; @@ -577,11 +579,11 @@ perfclock_t prof_dev_list_minevtime(void); "usage-self", "sampling-limit", "perfeval-cpus", "perfeval-pids", "version", "verbose", "quiet", "help" #define PROFILER_ARGV_FILTER \ "FILTER OPTION:", \ - "exclude-host", "exclude-guest", "exclude-user", "exclude-kernel", "python-callchain", \ + "exclude-host", "exclude-guest", "exclude-user", "exclude-kernel", "python-callchain", "bpf-python-callchain",\ "user-callchain", "kernel-callchain", \ "irqs_disabled", "tif_need_resched", "exclude_pid", "nr_running_min", "nr_running_max" #define PROFILER_ARGV_CALLCHAIN_FILTER \ - "FILTER OPTION:", "user-callchain", "kernel-callchain", "python-callchain" + "FILTER OPTION:", "user-callchain", "kernel-callchain", "python-callchain", "bpf-python-callchain" #define PROFILER_ARGV_PROFILER \ 
"PROFILER OPTION:" \ @@ -708,4 +710,7 @@ void pystack_unlink(struct prof_dev *main_dev); union perf_event * pystack_perf_event(struct prof_dev *main_dev, union perf_event *event, bool *writable, int reserved); +//bpf_pystack.c +int bpf_pystack_link(struct prof_dev *main_dev); +void bpf_pystack_unlink(struct prof_dev *main_dev); #endif diff --git a/pystack.c b/pystack.c index 244d9b8..9387b17 100644 --- a/pystack.c +++ b/pystack.c @@ -5,6 +5,10 @@ #include #include #include +#include +#include +#include +#include #define PYSTACK_MAX_DEPTH 128 @@ -248,13 +252,25 @@ static void pystack_sample(struct prof_dev *dev, union perf_event *event, int in } __packed raw; } *data = (void *)event->sample.array; unsigned short common_type = data->raw.common_type; - struct function__entry_return *function = &data->raw.function; - const char *filename = (const char *)function + function->filename_offset; - const char *funcname = (const char *)function + function->funcname_offset; - int lineno = function->lineno; + // struct function__entry_return *function = &data->raw.function; + // const char *filename = (const char *)function + function->filename_offset; + // const char *funcname = (const char *)function + function->funcname_offset; + // int lineno = function->lineno; + // struct pystack_node tmp, *node; + // struct rb_node *rbn; + // char buf[4096]; + struct function__entry_return function_tmp; + struct function__entry_return *function; + const char *filename,*funcname; + int lineno; struct pystack_node tmp, *node; struct rb_node *rbn; char buf[4096]; + memcpy(&function_tmp, &data->raw.function, sizeof(function_tmp)); + function = &function_tmp; + filename = (const char *)function + function->filename_offset; + funcname = (const char *)function + function->funcname_offset; + lineno = function->lineno; if (!function->common_pid) return; @@ -267,9 +283,11 @@ static void pystack_sample(struct prof_dev *dev, union perf_event *event, int in node = rb_entry_safe(rbn, struct pystack_node, rbnode); if (node) { int n = snprintf(buf, sizeof(buf), "%s (%s:%d)", funcname, filename, lineno); + printf("[pystack in stack] %s,%s,%d\n", funcname, filename, lineno); if (common_type == ctx->fun_entry) { if (node->depth < PYSTACK_MAX_DEPTH) { const char *str = unique_string_len(buf, n>sizeof(buf) ? 
@@ -335,9 +353,6 @@ union perf_event *
 pystack_perf_event(struct prof_dev *main_dev, union perf_event *event, bool *writable, int reserved)
 {
     struct prof_dev *pydev = main_dev->links.pystack;
-    struct pystack_ctx *ctx;
-    struct pystack_node tmp, *node;
-    struct rb_node *rbn;
     void *data;
     int pid;
     bool callchain;
@@ -362,47 +378,99 @@ pystack_perf_event(struct prof_dev *main_dev, union perf_event *event, bool *wri
     if (!callchain)
         return event;
 
-    ctx = pydev->private;
-    tmp.pid = pid;
-    rbn = rblist__find(&ctx->pystack, &tmp);
-    node = rb_entry_safe(rbn, struct pystack_node, rbnode);
-    if (node) {
-        union perf_event *new_event = ctx->fixed_event + reserved;
-        struct callchain *cc = data + main_dev->pos.callchain_pos;
-        int copy_len = (void *)&cc->ips[cc->nr] - (void *)event;
-        int depth = node->depth;
-        int d;
-
-        /*
-         * Generate new events: put PERF_CONTEXT_PYSTACK at the end of
-         * the callchain.
-         *
-         * { u64 nr,
-         *   u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
-         *
-         * Default callchain context order:
-         *     PERF_CONTEXT_KERNEL
-         *     PERF_CONTEXT_USER
-         *     PERF_CONTEXT_PYSTACK => Contains only the unique string:
-         *                             "funcname (filename:lineno)"
-         */
-        memcpy(new_event, event, copy_len);
-        data = (void *)new_event->sample.array;
-        cc = data + main_dev->pos.callchain_pos;
-        cc->ips[cc->nr++] = PERF_CONTEXT_PYSTACK;
-
-        if (depth > PYSTACK_MAX_DEPTH)
-            depth = PYSTACK_MAX_DEPTH;
-        d = depth;
-        while (depth-- > 0)
-            cc->ips[cc->nr++] = (u64)node->stack[depth];
-
-        if (event->header.size > copy_len)
-            memcpy((void *)&cc->ips[cc->nr], (void *)event + copy_len, event->header.size - copy_len);
-
-        new_event->header.size += (d+1) * sizeof(u64);
-        *writable = 1;
-        return new_event;
+    if (main_dev->env->python_callchain) {
+        struct pystack_ctx *ctx;
+        struct pystack_node tmp, *node;
+        struct rb_node *rbn;
+
+        tmp.pid = pid;
+        ctx = pydev->private;
+        rbn = rblist__find(&ctx->pystack, &tmp);
+        node = rb_entry_safe(rbn, struct pystack_node, rbnode);
+        if (node) {
+            union perf_event *new_event = ctx->fixed_event + reserved;
+            struct callchain *cc = data + main_dev->pos.callchain_pos;
+            int copy_len = (void *)&cc->ips[cc->nr] - (void *)event;
+            int depth = node->depth;
+            int d;
+
+            /*
+             * Generate a new event: put PERF_CONTEXT_PYSTACK at the end of
+             * the callchain.
+             *
+             * { u64 nr,
+             *   u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
+             *
+             * Default callchain context order:
+             *     PERF_CONTEXT_KERNEL
+             *     PERF_CONTEXT_USER
+             *     PERF_CONTEXT_PYSTACK => Contains only the unique string:
+             *                             "funcname (filename:lineno)"
+             */
+            memcpy(new_event, event, copy_len);
+            data = (void *)new_event->sample.array;
+            cc = data + main_dev->pos.callchain_pos;
+            cc->ips[cc->nr++] = PERF_CONTEXT_PYSTACK;
+
+            if (depth > PYSTACK_MAX_DEPTH)
+                depth = PYSTACK_MAX_DEPTH;
+            d = depth;
+            while (depth-- > 0)
+                cc->ips[cc->nr++] = (u64)node->stack[depth];
+
+            if (event->header.size > copy_len)
+                memcpy((void *)&cc->ips[cc->nr], (void *)event + copy_len, event->header.size - copy_len);
+            new_event->header.size += (d+1) * sizeof(u64);
+            *writable = 1;
+            return new_event;
+        }
+    } else if (main_dev->env->bpf_python_callchain) {
+        /* Local mirror of the leading members of struct bpf_pystack_ctx
+         * defined in bpf_pystack.c. */
+        struct bpf_pystack_ctx {
+            struct bpf_pystack_bpf *obj;
+            void *fixed_event;
+        } *ctx = pydev->private;
+        struct stack_t node = {0};
+        int str_id_to_str_fd, thread_stack_fd;
+
+        str_id_to_str_fd = bpf_map__fd(ctx->obj->maps.str_id_to_str);
+        thread_stack_fd = bpf_map__fd(ctx->obj->maps.thread_stack);
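+
+        /*
+         * Read this thread's Python stack snapshot from the BPF maps:
+         * thread_stack holds one {filefunc hash, lineno, time} entry per
+         * frame, and str_id_to_str resolves each hash back to its
+         * "filename funcname" string. The frames are appended to the perf
+         * callchain below, mirroring the rblist-based path above.
+         */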
+        if (bpf_map_lookup_elem(thread_stack_fd, &pid, &node) < 0) {
+            return event;
+        } else if (node.depth <= 0 || node.depth > MAX_STACK_DEPTH) {
+            printf("tid %d has invalid python callchain depth %d, skipping this event.\n", pid, node.depth);
+            return event;
+        } else {
+            union perf_event *new_event = ctx->fixed_event + reserved;
+            struct callchain *cc = data + main_dev->pos.callchain_pos;
+            int copy_len = (void *)&cc->ips[cc->nr] - (void *)event;
+            int depth = node.depth;
+            int d;
+
+            if (copy_len <= 0 || copy_len > event->header.size) {
+                printf("Invalid copy_len: %d, event->header.size: %u\n", copy_len, event->header.size);
+                return event;
+            }
+            memcpy(new_event, event, copy_len);
+            data = (void *)new_event->sample.array;
+            cc = data + main_dev->pos.callchain_pos;
+            cc->ips[cc->nr++] = PERF_CONTEXT_PYSTACK;
+
+            if (depth > MAX_STACK_DEPTH)
+                depth = MAX_STACK_DEPTH;
+            d = depth;
+            while (depth-- > 0) {
+                struct perstack *perstack = &node.pystack[depth];
+                char filefunc[MAX_STR_LEN] = {0};
+                char buf[4096];
+                const char *str;
+                int len;
+                int ret = bpf_map_lookup_elem(str_id_to_str_fd, &perstack->filefunc, filefunc);
+
+                if (ret < 0) {
+                    printf("failed to find string for hash %llu in str_id_to_str\n",
+                           (unsigned long long)perstack->filefunc);
+                    d--;    /* keep the header size accounting in step */
+                    continue;
+                }
+                /* Slot 0 caches the completed call with the largest observed
+                 * latency, not a frame of the current stack. */
+                if (depth == 0)
+                    len = snprintf(buf, sizeof(buf), "%s:%d (%.2fms) [Max delay, not in current stack]",
+                                   filefunc, perstack->lineno, (double)(perstack->time / 1000000.00));
+                else
+                    len = snprintf(buf, sizeof(buf), "%s:%d", filefunc, perstack->lineno);
+                str = unique_string_len(buf, len > sizeof(buf) ? sizeof(buf)-1 : len);
+                cc->ips[cc->nr++] = (u64)str;
+            }
+            if (event->header.size > copy_len)
+                memcpy((void *)&cc->ips[cc->nr], (void *)event + copy_len, event->header.size - copy_len);
+            new_event->header.size += (d+1) * sizeof(u64);
+            *writable = 1;
+            return new_event;
+        }
     }
 
     return event;
 }
diff --git a/task-state.c b/task-state.c
index 506477f..f2a66de 100644
--- a/task-state.c
+++ b/task-state.c
@@ -972,7 +972,7 @@ static const char *task_state_argv[] = PROFILER_ARGV("task-state",
     PROFILER_ARGV_OPTION, PROFILER_ARGV_CALLCHAIN_FILTER, PROFILER_ARGV_PROFILER,
     "interruptible", "uninterruptible", "than", "filter\nFilter process comm",
-    "perins", "call-graph", "flame-graph", "ptrace");
+    "perins", "call-graph", "flame-graph", "ptrace", "bpf-python-callchain", "python-callchain");
 struct monitor task_state = {
     .name = "task-state",
     .desc = task_state_desc,
diff --git a/tests/test_--bpf-python-callchain.py b/tests/test_--bpf-python-callchain.py
new file mode 100644
index 0000000..532c9b7
--- /dev/null
+++ b/tests/test_--bpf-python-callchain.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+
+from PerfProf import PerfProf
+from conftest import result_check
+
+def test_bpf_pystack_taskstate(runtime, memleak_check):
+    # perf-prof task-state --filter 'java,python*' -i 1000 --than 100ms -g --bpf-python-callchain /usr/local/lib/libpython3.6m.so.1.0
+    task_state = PerfProf(['task-state', '--filter', 'java,python*', '-i', '1000', '--than', '100ms', '-g', '--bpf-python-callchain', '/usr/local/lib/libpython3.6m.so.1.0'])
+    for std, line in task_state.run(runtime, memleak_check):
+        result_check(std, line, runtime, memleak_check)
\ No newline at end of file

From 6d4c0ebdb4ab0d6b487b1589d9e3879cbac208fd Mon Sep 17 00:00:00 2001
From: albertxu216 <1418106416@qq.com>
Date: Thu, 14 Aug 2025 10:48:23 +0800
Subject: [PATCH 3/4] bpf-pystack: use eBPF to maintain the call stacks of
 Python threads

bpf-pystack uses eBPF USDT probes to maintain the call stacks of Python
threads and outputs the Python stack together with the kernel and user
call stacks at the appropriate times. Compared to pystack, bpf-pystack
also supports stack maintenance for newly created processes.
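
Usage example (the libpython path is environment-specific and the CPython
build must ship the USDT probes; the path below matches the test case):
- Append Python call chains to task-state latency sampling:
  perf-prof task-state --filter 'java,python*' -i 1000 --than 100ms -g --bpf-python-callchain /usr/local/lib/libpython3.6m.so.1.0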
---
 bpf-skel/bpf_pystack.bpf.c | 50 ++++----------------------------------
 1 file changed, 5 insertions(+), 45 deletions(-)

diff --git a/bpf-skel/bpf_pystack.bpf.c b/bpf-skel/bpf_pystack.bpf.c
index 53a3195..bd32b4e 100644
--- a/bpf-skel/bpf_pystack.bpf.c
+++ b/bpf-skel/bpf_pystack.bpf.c
@@ -1,5 +1,5 @@
 #include "vmlinux.h"
-#include // BPF helper functions
+#include
 #include
 #include
 #include "bpf_pystack.h"
@@ -7,9 +7,7 @@
 
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
 
-// Format string in the read-only data section
 const char fmt[] SEC(".rodata") = "%s %s";
-// Zeroed template used when inserting into thread_stack
 const struct stack_t zero_stack = {.depth = 1};
 
 struct {
@@ -19,7 +17,6 @@ struct {
     __uint(max_entries, 1024);
 } str_id_to_str SEC(".maps");
 
-// Per-thread call stack map, key=tid, value=stack_t
 struct {
     __uint(type, BPF_MAP_TYPE_HASH);
     __uint(key_size, sizeof(u32));
@@ -27,7 +24,6 @@ struct {
     __uint(max_entries, 10240);
 } thread_stack SEC(".maps");
 
-/* Hash a string */
 static __always_inline u64 fnv1a_hash(const char *str, int len) {
     u64 hash = 14695981039346656037ULL;
 #pragma unroll
@@ -41,7 +37,6 @@ static __always_inline u64 fnv1a_hash(const char *str, int len) {
     return hash;
 }
 
-/* String comparison */
 static __always_inline int my_bpf_strncmp(const char *s1, char *s2, int n)
 {
 #pragma unroll
@@ -63,7 +58,6 @@ static __always_inline int fill_func_buf(char *buf, size_t buf_sz, long filename
     return bpf_snprintf(buf, buf_sz, fmt, data, sizeof(data));
 }
 
-/* Helper: hash the string and look it up/insert it in the map; returns the map value pointer or NULL */
 static __always_inline char *get_or_insert_funcstr(u64 *hash_out, char *buf, int len) {
     u64 hash = fnv1a_hash(buf, len);
     char *filefunc = bpf_map_lookup_elem(&str_id_to_str, &hash);
@@ -74,60 +68,46 @@ static __always_inline char *get_or_insert_funcstr(u64 *hash_out, char *buf, int
         *hash_out = hash;
         return buf;
     } else {
-        if (my_bpf_strncmp(filefunc, buf, MAX_STR_LEN) != 0){
-            bpf_printk("hash err String mismatch: %s != %s\n", filefunc, buf);
+        if (my_bpf_strncmp(filefunc, buf, MAX_STR_LEN) != 0)
             return NULL;
-        }
         *hash_out = hash;
         return filefunc;
     }
 }
 
-/* Helper: get the thread stack pointer, initializing it if missing */
 static __always_inline struct stack_t *get_or_init_stack(u32 tid) {
     struct stack_t *stack = bpf_map_lookup_elem(&thread_stack, &tid);
     if (!stack) {
         bpf_map_update_elem(&thread_stack, &tid, &zero_stack, BPF_ANY);
         stack = bpf_map_lookup_elem(&thread_stack, &tid);
-        if (!stack) {
-            bpf_printk("Failed to get or init stack for tid %d\n", tid);
+        if (!stack)
             return NULL;
-        }
     }
     return stack;
 }
 
-/* Helper: push onto the call stack */
 static __always_inline int stack_push(struct stack_t *stack, u64 hash, int lineno, u64 now) {
     if (stack->depth < MAX_STACK_DEPTH) {
         stack->pystack[stack->depth].filefunc = hash;
         stack->pystack[stack->depth].lineno = lineno;
         stack->pystack[stack->depth].time = now;
-        bpf_printk("[entry] push function_entry: %llu %d depth %d\n",
-            stack->pystack[stack->depth].filefunc,
-            stack->pystack[stack->depth].lineno,
-            stack->depth);
         stack->depth++;
         return 0;
     }
     return -1;
 }
 
-/* Helper: pop and verify the hash */
 static __always_inline int stack_pop_and_check(struct stack_t *stack, u64 hash, u64 time) {
     if (stack->depth > 1) {
         stack->depth--;
         int cur_depth = stack->depth;
         if (cur_depth < 1 || cur_depth >= MAX_STACK_DEPTH) {
-            stack->depth++; // restore depth
-            bpf_printk("[return] stack underflow or overflow for tid\n");
+            stack->depth++;
             return -1;
         }
-        // use cur_depth for all array accesses
         if (cur_depth > 0 && cur_depth < MAX_STACK_DEPTH) {
             if (stack->pystack[cur_depth].filefunc != hash) {
-                stack->depth++; // restore depth
-                bpf_printk("[return] hash %llu != %llu filefunc \n", hash, stack->pystack[cur_depth].filefunc);
+                stack->depth++;
                 return -2;
             }
             u64 delay = time - stack->pystack[cur_depth].time;
@@ -135,13 +115,7 @@ static __always_inline int stack_pop_and_check(struct stack_t *stack, u64 hash,
             stack->pystack[0].filefunc = stack->pystack[cur_depth].filefunc;
             stack->pystack[0].lineno = stack->pystack[cur_depth].lineno;
             stack->pystack[0].time = delay;
-            bpf_printk("[return] %lu - %lu = %lu \n", time, stack->pystack[cur_depth].time, delay);
         }
-        bpf_printk("[return] pop sucess function_return: %llu %d delay %llu depth %d \n",
-            stack->pystack[cur_depth].filefunc,
-            stack->pystack[cur_depth].lineno,
-            delay,
-            cur_depth);
         return 1;
     }
 }
@@ -156,22 +130,16 @@ int BPF_USDT(probe_function_entry,long filename_ptr,long funcname_ptr,long linen
     char *filefunc;
 
     now = bpf_ktime_get_ns();
-    /*1. Concatenate the filename and funcname strings into buf */
     ret = fill_func_buf(buf, sizeof(buf), filename_ptr, funcname_ptr);
     if (ret <= 0)
         return 0;
-    //bpf_printk("[entry]:%s %d\n", buf, lineno);
-
-    /*2. Hash the buf string and store it in str_id_to_str */
     filefunc = get_or_insert_funcstr(&hash, buf, ret);
     if (!filefunc)
         return 0;
-    /*3. Get the current thread's thread_stack */
     u32 tid = bpf_get_current_pid_tgid();
     struct stack_t *stack = get_or_init_stack(tid);
     if (!stack) return 0;
 
-    /*4. Push the id onto the call stack */
     if (stack_push(stack, hash, lineno, now) < 0)
         bpf_printk("[entry] stack overflow for tid %d\n", tid);
 
@@ -186,24 +154,16 @@ int BPF_USDT(probe_function_return,long filename_ptr,long funcname_ptr,long line
     char *filefunc;
 
    now = bpf_ktime_get_ns();
-    /*1. Concatenate the filename and funcname strings into buf */
     ret = fill_func_buf(buf, sizeof(buf), filename_ptr, funcname_ptr);
     if (ret <= 0)
         return 0;
-
-    //bpf_printk("[return]:%s %d\n", buf, lineno);
-
-    /*2. Hash the buf string and store it in str_id_to_str */
     filefunc = get_or_insert_funcstr(&hash, buf, ret);
     if (!filefunc)
         return 0;
-    /*3. Get the current thread's thread_stack */
     u32 tid = bpf_get_current_pid_tgid();
     struct stack_t *stack = get_or_init_stack(tid);
     if (!stack) return 0;
-    /*4. Pop the id from the call stack */
     stack_pop_and_check(stack, hash, now);
-
     return 0;
 }

From a5d0b22f5aa3c19a11774494b5d27c4f8bb4f6f3 Mon Sep 17 00:00:00 2001
From: albertxu216 <1418106416@qq.com>
Date: Thu, 14 Aug 2025 10:48:48 +0800
Subject: [PATCH 4/4] bpf-pystack: clean up bpf_pystack.h comments

---
 bpf-skel/bpf_pystack.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/bpf-skel/bpf_pystack.h b/bpf-skel/bpf_pystack.h
index f679860..eb0526f 100644
--- a/bpf-skel/bpf_pystack.h
+++ b/bpf-skel/bpf_pystack.h
@@ -1,9 +1,7 @@
 #include
 #include
 
-// Maximum call stack depth
 #define MAX_STACK_DEPTH 32
-// Maximum string length
 #define MAX_STR_LEN 128
 
 struct perstack{
@@ -11,8 +9,7 @@ struct perstack{
     u64 time;
     int lineno;
 };
-// Per-thread call stack
 struct stack_t {
     u32 depth;
     struct perstack pystack[MAX_STACK_DEPTH];
-};
\ No newline at end of file
+};