BCC with libbpf

BCC all in one

Posted by pandaychen on February 22, 2025

0x00 前言

测试机内核版本5.4.119-1-tlinux4-0008,python版本3.6.8

代码来源:

0x01 基础

0x02 文件系统相关

filetop

TID     COMM             READS  WRITES R_Kb    W_Kb    T FILE
1399240 sap1002          4      0      20479   0       R sockstat
3274711 sap1008          7      0      4900    0       R execve_info
3130102 clear            2      0      60      0       R xterm-256color
3130102 sh               2      0      8       0       R locale.alias
3130102 sh               14     0      5       0       R libc-2.28.so
3130102 sh               4      0      4       0       R cmdline
3130102 sh               4      0      4       0       R cmdline
3130100 filetop          2      0      2       0       R loadavg
3130102 sh               2      0      2       0       R cmdline
3130102 sh               2      0      2       0       R cmdline
3130102 sh               2      0      2       0       R cmdline
3130102 sh               6      0      1       0       R libtinfo.so.6.1
3130102 sh               2      0      1       0       R libdl-2.28.so
3130102 sh               2      0      1       0       R libonion_security.so.1.0.19
3130102 sh               2      0      1       0       R libonion_block_security.so.1.0.16
3130102 sh               2      0      1       0       R stat
3130102 sh               2      0      1       0       R stat
3130102 filetop          4      0      1       0       R ld-2.28.so
764743  sa1009          2      0      1       0       R stat
764737  sa1005          2      0      1       0       R stat

和python版本类似,仅统计文件的读/写操作;由于是在函数入口处统计,读写失败的调用也会被计入。追踪点为kprobe/vfs_read和kprobe/vfs_write,记录文件操作类型、读写大小、文件名等信息

/* kprobe on vfs_read(): account one read of the requested size for this file. */
SEC("kprobe/vfs_read")
int BPF_KPROBE(vfs_read_entry, struct file *file, char *buf,
               size_t count, loff_t *pos)
{
        /* buf and pos are part of the probed signature but are not used here. */
        return probe_entry(ctx, file, count, READ);
}

/* kprobe on vfs_write(): account one write of the requested size for this file. */
SEC("kprobe/vfs_write")
int BPF_KPROBE(vfs_write_entry, struct file *file, const char *buf,
               size_t count, loff_t *pos)
{
        /* buf and pos are part of the probed signature but are not used here. */
        return probe_entry(ctx, file, count, WRITE);
}

bpf hash表entries的key/value都是结构体:

/*
 * Hash map of per-file I/O statistics: keyed by struct file_id, holding a
 * struct file_stat per key, with room for at most MAX_ENTRIES entries.
 */
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, MAX_ENTRIES);
        __type(key, struct file_id);
        __type(value, struct file_stat);
} entries SEC(".maps");

/* Key of the entries map: identifies one file as accessed by one thread. */
struct file_id {
        __u64 inode;    /* inode number (i_ino) */
        __u32 dev;      /* device of the inode's superblock (i_sb->s_dev) */
        __u32 rdev;     /* i_rdev of the inode */
        __u32 pid;      /* upper 32 bits of bpf_get_current_pid_tgid() */
        __u32 tid;      /* lower 32 bits of bpf_get_current_pid_tgid() */
};

/* Value of the entries map: accumulated read/write statistics for one key. */
struct file_stat {
        __u64 reads;            /* number of vfs_read() calls seen */
        __u64 read_bytes;       /* sum of the count argument over those reads */
        __u64 writes;           /* number of vfs_write() calls seen */
        __u64 write_bytes;      /* sum of the count argument over those writes */
        __u32 pid;              /* same value as file_id.pid */
        __u32 tid;              /* same value as file_id.tid */
        char filename[PATH_MAX];        /* dentry name of the file, not a full path */
        char comm[TASK_COMM_LEN];       /* from bpf_get_current_comm() at first access */
        char type;              /* 'R' regular file, 'S' socket, 'O' anything else */
};

主要实现逻辑较为直观:

/*
 * Copy the dentry name (final path component) of @file into @buf.
 *
 * @file: kernel struct file being read or written
 * @buf:  destination buffer
 * @size: capacity of @buf in bytes
 *
 * The string-aware helper is used so the copy stops at the name's NUL
 * terminator. A fixed-size bpf_probe_read_kernel() of @size bytes would
 * read well past a short name: the whole read fails if that range crosses
 * into unmapped memory, and otherwise unrelated kernel bytes that follow
 * the name are copied into the destination.
 */
static void get_file_path(struct file *file, char *buf, size_t size)
{
        struct qstr dname;

        /* Read the qstr first, then follow its name pointer. */
        dname = BPF_CORE_READ(file, f_path.dentry, d_name);
        bpf_probe_read_kernel_str(buf, size, dname.name);
}

/*
 * Shared handler for the vfs_read/vfs_write kprobes.
 *
 * Accounts one operation of @count bytes against the map entry for the
 * current (file, pid, tid), creating and describing the entry on first
 * sight. Calls filtered out by target_pid or regular_file_only are ignored.
 *
 * Always returns 0.
 */
static int probe_entry(struct pt_regs *ctx, struct file *file, size_t count, enum op op)
{
        __u64 id = bpf_get_current_pid_tgid();
        __u32 pid = id >> 32;
        __u32 tid = (__u32)id;
        struct file_id key = {};
        struct file_stat *stat;
        int mode;

        /* Optional process filter. */
        if (target_pid && target_pid != pid)
                return 0;

        /* Optional file-type filter. */
        mode = BPF_CORE_READ(file, f_inode, i_mode);
        if (regular_file_only && !S_ISREG(mode))
                return 0;

        key.dev = BPF_CORE_READ(file, f_inode, i_sb, s_dev);
        key.rdev = BPF_CORE_READ(file, f_inode, i_rdev);
        key.inode = BPF_CORE_READ(file, f_inode, i_ino);
        key.pid = pid;
        key.tid = tid;

        stat = bpf_map_lookup_elem(&entries, &key);
        if (!stat) {
                /* First access: insert a zeroed entry, then fill in its identity. */
                bpf_map_update_elem(&entries, &key, &zero_value, BPF_ANY);
                stat = bpf_map_lookup_elem(&entries, &key);
                if (!stat)
                        return 0;

                stat->pid = pid;
                stat->tid = tid;
                bpf_get_current_comm(&stat->comm, sizeof(stat->comm));
                get_file_path(file, stat->filename, sizeof(stat->filename));

                if (S_ISREG(mode))
                        stat->type = 'R';
                else if (S_ISSOCK(mode))
                        stat->type = 'S';
                else
                        stat->type = 'O';
        }

        switch (op) {
        case READ:
                stat->reads++;
                stat->read_bytes += count;
                break;
        default:        /* WRITE */
                stat->writes++;
                stat->write_bytes += count;
                break;
        }

        return 0;
}

filelife

与python工具功能类似,该工具在kprobe/vfs_create、kprobe/vfs_open、kprobe/security_inode_create处追踪文件新建,其中security_inode_create(检查创建文件权限)作为兜底钩子,在文件创建时将时间戳记录到map中;在kprobe/vfs_unlink(删除文件的函数入口)处,从map中读取文件创建/打开时的时间戳,计算时间差,并保存文件路径等信息;在kretprobe/vfs_unlink处根据返回值是否为0判断删除文件是否成功,成功则将保存的信息发送给用户层

这里主要注意下libbpf(C)版本是如何处理不同内核版本之间函数签名差异的,以vfs_create函数为例,在最近的Linux内核版本中,有三种典型的函数声明:

/* Form without a leading mount-related argument: dentry is the 2nd parameter. */
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
            bool want_excl);
/* Form with a leading user_namespace argument: dentry is the 3rd parameter. */
int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
            struct dentry *dentry, umode_t mode, bool want_excl);
/* Form with a leading mnt_idmap argument: dentry is again the 3rd parameter. */
int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
            struct dentry *dentry, umode_t mode, bool want_excl);

TODO

/*
 * kprobe on vfs_create(): pass the dentry of the file being created to
 * probe_create(). The arguments are taken as untyped pointers because the
 * argument forwarded depends on which kernel layout is detected.
 */
SEC("kprobe/vfs_create")
int BPF_KPROBE(vfs_create, void *arg0, void *arg1, void *arg2)
{
        bool has_mnt_arg = renamedata_has_old_mnt_userns_field()
                                || renamedata_has_new_mnt_idmap_field();

        /* Third argument when either field is detected, otherwise the second. */
        return probe_create(has_mnt_arg ? arg2 : arg1);
}

/*
 * kprobe on vfs_open(): treat the open as a creation only when the file's
 * f_mode has FMODE_CREATED set; every other open is ignored.
 */
SEC("kprobe/vfs_open")
int BPF_KPROBE(vfs_open, struct path *path, struct file *file)
{
        int fmode = BPF_CORE_READ(file, f_mode);

        if (fmode & FMODE_CREATED)
                return probe_create(BPF_CORE_READ(path, dentry));

        return 0;
}

/* kprobe on security_inode_create(): hand the dentry being created to probe_create(). */
SEC("kprobe/security_inode_create")
int BPF_KPROBE(security_inode_create, struct inode *dir, struct dentry *dentry)
{
        /* dir is part of the probed signature but is not needed here. */
        return probe_create(dentry);
}

/**
 * kprobe on vfs_unlink(): if the dentry being removed has an entry in the
 * start map, stash its age and identity in currevent (keyed by thread id)
 * for the matching return probe.
 *
 * In different kernel versions, function vfs_unlink() has three
 * declarations, and their parameter lists are as follows:
 *
 * int vfs_unlink(struct inode *dir, struct dentry *dentry,
 *        struct inode **delegated_inode);
 * int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
 *        struct dentry *dentry, struct inode **delegated_inode);
 * int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
 *        struct dentry *dentry, struct inode **delegated_inode);
 *
 * so the dentry is either the second or the third argument.
 */
SEC("kprobe/vfs_unlink")
int BPF_KPROBE(vfs_unlink, void *arg0, void *arg1, void *arg2)
{
        u64 pid_tgid = bpf_get_current_pid_tgid();
        u32 tgid = pid_tgid >> 32;
        u32 tid = (u32)pid_tgid;
        bool has_arg = renamedata_has_old_mnt_userns_field()
                                || renamedata_has_new_mnt_idmap_field();
        void *dentry = has_arg ? arg2 : arg1;
        struct unlink_event unlink_event = {};
        struct create_arg *arg;

        arg = bpf_map_lookup_elem(&start, &dentry);
        if (!arg)
                return 0;   /* no entry recorded for this dentry */

        unlink_event.delta_ns = bpf_ktime_get_ns() - arg->ts;
        unlink_event.tgid = tgid;
        unlink_event.dentry = dentry;
        unlink_event.cwd_vfsmnt = arg->cwd_vfsmnt;

        bpf_map_update_elem(&currevent, &tid, &unlink_event, BPF_ANY);
        return 0;
}

/*
 * kretprobe on vfs_unlink(): report a file whose unlink succeeded.
 *
 * Picks up the record stashed by the entry probe for this thread, and if
 * vfs_unlink() returned 0, fills an event with the tgid, the stored time
 * delta, the current comm and the file name (optionally the full path)
 * and submits it.
 *
 * Always returns 0.
 */
SEC("kretprobe/vfs_unlink")
int BPF_KRETPROBE(vfs_unlink_ret)
{
        u64 id = bpf_get_current_pid_tgid();
        u32 tid = (u32)id;
        int ret = PT_REGS_RC(ctx);
        struct unlink_event *pending;
        struct unlink_event ev;
        struct event *eventp;
        struct dentry *dentry;
        const u8 *qs_name_ptr;

        pending = bpf_map_lookup_elem(&currevent, &tid);
        if (!pending)
                return 0;

        /*
         * Snapshot the value before deleting the key. Once the element is
         * deleted its storage may be recycled for another entry, so the
         * map pointer must not be dereferenced afterwards.
         */
        ev = *pending;
        bpf_map_delete_elem(&currevent, &tid);

        /* skip failed unlink */
        if (ret)
                return 0;

        /*
         * The file is gone, so its start entry is stale from here on.
         * Remove it before reserving the output buffer so that a failed
         * reservation does not leave the entry behind.
         */
        bpf_map_delete_elem(&start, &ev.dentry);

        eventp = reserve_buf(sizeof(*eventp));
        if (!eventp)
                return 0;

        eventp->tgid = ev.tgid;
        eventp->delta_ns = ev.delta_ns;
        bpf_get_current_comm(&eventp->task, sizeof(eventp->task));

        dentry = ev.dentry;
        qs_name_ptr = BPF_CORE_READ(dentry, d_name.name);
        bpf_probe_read_kernel_str(&eventp->fname.pathes, sizeof(eventp->fname.pathes),
                           qs_name_ptr);
        eventp->fname.depth = 0;

        /* get full-path */
        if (full_path && eventp->fname.pathes[0] != '/')
                bpf_dentry_full_path(eventp->fname.pathes, NAME_MAX,
                                MAX_PATH_DEPTH,
                                ev.dentry,
                                ev.cwd_vfsmnt,
                                &eventp->fname.failed, &eventp->fname.depth);

        /* output */
        submit_buf(ctx, eventp, sizeof(*eventp));
        return 0;
}

syncsnoop

syncsnoop

0x08 参考