0x00 Foreword
The proc file system (procfs) is a kernel-backed virtual file system that presents an interface to kernel data structures in the form of directories and files; through it, various system attributes can be inspected and changed. From a developer's perspective, procfs is a special virtual file system that can be mounted into the directory tree, allowing user-space processes to read kernel information conveniently via ordinary system calls such as read/write.
The kernel code in this article is based on v4.11.6.

- procfs does not depend on a physical storage device; the kernel generates its files and directories dynamically to expose system information (process state, hardware configuration, kernel parameters, and so on). It registers with the VFS, mounts into the file-system tree (at /proc), and follows the VFS interface conventions (inode_operations, file_operations)
- Kernel data structures: the contents of procfs (e.g. /proc/cpuinfo, /proc/meminfo) are generated dynamically by the kernel, with callback functions answering read/write requests
- VFS interface: procfs must implement the operations the VFS defines (e.g. open(), read(), readdir()) before user space can access it; the module sketch below shows this wiring end to end
- The contents of procfs are managed entirely by the kernel; the command mount -t proc proc /proc attaches procfs to the VFS tree, making it part of the file system
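To make the interface concrete, here is a minimal kernel-module sketch that registers a read-only /proc/hello backed by the seq_file helpers, using the v4.11-era proc_create()/file_operations API (the file name hello and the module itself are hypothetical examples, not part of the kernel code analyzed below):
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

// the single "show" callback: seq_file turns this into read()
static int hello_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello from procfs\n");
	return 0;
}

static int hello_open(struct inode *inode, struct file *file)
{
	return single_open(file, hello_show, NULL);
}

static const struct file_operations hello_fops = {
	.owner   = THIS_MODULE,
	.open    = hello_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init hello_init(void)
{
	// create /proc/hello, world-readable
	if (!proc_create("hello", 0444, NULL, &hello_fops))
		return -ENOMEM;
	return 0;
}

static void __exit hello_exit(void)
{
	remove_proc_entry("hello", NULL);
}

module_init(hello_init);
module_exit(hello_exit);
MODULE_LICENSE("GPL");
Once loaded, cat /proc/hello travels through the VFS open/read path and ends up in hello_show via seq_read.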
Mounting proc
# Mount the `proc` device under `/proc`
# with the filesystem type specified as
# `proc`.
#
# note: you could mount procfs pretty much
# anywhere you want and specify any
# device name.
#
#
# .------------> fs type
# |
# | .------> device (dummy identifier
# | | in this case - anything)
# | |
# | | .-> location
# | | |
# | | |
mount -t proc proc /proc
0x01 Introduction: Accessing /proc Files
The /proc/ directory contains files and directories (including one numeric directory per PID):
[root@VM-X-X-tencentos ~]# ls /proc/ -al
total 4
dr-xr-xr-x 270 root root 0 Nov 7 07:06 .
dr-xr-xr-x 19 root root 4096 Nov 7 16:46 ..
dr-xr-xr-x 8 root root 0 Nov 7 07:06 1
......
dr-xr-xr-x 8 root root 0 Nov 7 07:06 15
dr-xr-xr-x 8 root root 0 Nov 7 07:07 996
dr-xr-xr-x 3 root root 0 Nov 7 07:06 acpi
-r--r--r-- 1 root root 0 Nov 7 07:07 bt_stat
-r--r--r-- 1 root root 0 Nov 7 16:45 buddyinfo
dr-xr-xr-x 4 root root 0 Nov 7 07:07 bus
-r--r--r-- 1 root root 0 Nov 7 07:06 cgroups
-r--r--r-- 1 root root 0 Nov 7 07:06 cmdline
-r--r--r-- 1 root root 30094 Nov 7 07:07 config.gz
-r--r--r-- 1 root root 0 Nov 7 16:45 consoles
-r--r--r-- 1 root root 0 Nov 7 07:06 cpuinfo
-r--r--r-- 1 root root 0 Nov 7 16:45 crypto
-r--r--r-- 1 root root 0 Nov 7 16:45 devices
-r--r--r-- 1 root root 0 Nov 7 07:07 diskstats
-r--r--r-- 1 root root 0 Nov 7 16:45 dma
dr-xr-xr-x 4 root root 0 Nov 7 16:45 driver
-r--r--r-- 1 root root 0 Nov 7 16:45 execdomains
-r--r--r-- 1 root root 0 Nov 7 16:45 fb
-r--r--r-- 1 root root 0 Nov 7 07:06 filesystems
dr-xr-xr-x 5 root root 0 Nov 7 07:07 fs
-r--r--r-- 1 root root 0 Nov 7 07:06 interrupts
-r--r--r-- 1 root root 0 Nov 7 07:07 iomem
-r--r--r-- 1 root root 0 Nov 7 16:45 ioports
dr-xr-xr-x 28 root root 0 Nov 7 07:06 irq
-r--r--r-- 1 root root 0 Nov 7 07:07 kallsyms
-r-------- 1 root root 140737477890048 Nov 7 07:06 kcore
-r--r--r-- 1 root root 0 Nov 7 16:45 keys
-r--r--r-- 1 root root 0 Nov 7 16:45 key-users
-r-------- 1 root root 0 Nov 7 16:45 kmsg
-r-------- 1 root root 0 Nov 7 16:45 kpagecgroup
-r-------- 1 root root 0 Nov 7 16:45 kpagecount
-r-------- 1 root root 0 Nov 7 16:45 kpageflags
-rw-r--r-- 1 root root 0 Nov 7 16:45 latency_stats
-r--r--r-- 1 root root 0 Nov 7 07:07 loadavg
-r--r--r-- 1 root root 0 Nov 7 16:45 loadavg_bt
-r--r--r-- 1 root root 0 Nov 7 16:45 locks
-r--r--r-- 1 root root 0 Nov 7 07:17 mdstat
dr-xr-xr-x 2 root root 0 Nov 7 16:45 megaraid
-r--r--r-- 1 root root 0 Nov 7 07:06 meminfo
-r--r--r-- 1 root root 0 Nov 7 16:45 misc
-r--r--r-- 1 root root 0 Nov 7 16:45 module_md5_list
-r--r--r-- 1 root root 0 Nov 7 07:07 modules
lrwxrwxrwx 1 root root 11 Nov 7 07:06 mounts -> self/mounts
dr-xr-xr-x 4 root root 0 Nov 7 16:45 mpt
-rw-r--r-- 1 root root 0 Nov 7 16:45 mtrr
lrwxrwxrwx 1 root root 8 Nov 7 07:06 net -> self/net
-r-------- 1 root root 0 Nov 7 16:45 pagetypeinfo
-r--r--r-- 1 root root 0 Nov 7 07:06 partitions
dr-xr-xr-x 5 root root 0 Nov 7 07:07 pressure
dr-xr-xr-x 3 root root 0 Nov 7 16:45 rue
-r--r--r-- 1 root root 0 Nov 7 16:45 sched_debug
-r--r--r-- 1 root root 0 Nov 7 16:45 schedstat
dr-xr-xr-x 5 root root 0 Nov 7 16:45 scsi
lrwxrwxrwx 1 root root 0 Nov 7 07:06 self -> 26694
-r-------- 1 root root 0 Nov 7 16:45 slabinfo
dr-xr-xr-x 5 root root 0 Nov 7 16:45 sli
-r--r--r-- 1 root root 0 Nov 7 16:45 softirqs
-r--r--r-- 1 root root 0 Nov 7 07:07 stat
-r--r--r-- 1 root root 0 Nov 7 07:06 swaps
dr-xr-xr-x 1 root root 0 Nov 7 07:06 sys
--w------- 1 root root 0 Nov 7 16:45 sysrq-trigger
dr-xr-xr-x 5 root root 0 Nov 7 16:45 sysvipc
lrwxrwxrwx 1 root root 0 Nov 7 07:06 thread-self -> 26694/task/26694
-r-------- 1 root root 0 Nov 7 16:45 timer_list
dr-xr-xr-x 5 root root 0 Nov 7 16:45 tkernel
dr-xr-xr-x 6 root root 0 Nov 7 07:07 tty
-r--r--r-- 1 root root 0 Nov 7 07:06 uptime
-r--r--r-- 1 root root 0 Nov 7 07:07 version
-r-------- 1 root root 0 Nov 7 16:45 vmallocinfo
-r--r--r-- 1 root root 0 Nov 7 07:07 vmstat
-r--r--r-- 1 root root 0 Nov 7 16:45 zoneinfo
For example, reading the limits file under process 13323 returns that process's resource limits:
[root@VM-X-X-tencentos bcc]# cat /proc/13323/limits
Limit                     Soft Limit           Hard Limit           Units
Max cpu time              unlimited            unlimited            seconds
Max file size             unlimited            unlimited            bytes
Max data size             unlimited            unlimited            bytes
Max stack size            8388608              unlimited            bytes
Max core file size        0                    0                    bytes
Max resident set          unlimited            unlimited            bytes
Max processes             62825                62825                processes
Max open files            1024                 524288               files
Max locked memory         8388608              8388608              bytes
Max address space         unlimited            unlimited            bytes
Max file locks            unlimited            unlimited            locks
Max pending signals       62825                62825                signals
Max msgqueue size         819200               819200               bytes
Max nice priority         0                    0
Max realtime priority     0                    0
Max realtime timeout      unlimited            unlimited            us
Tracing the cat above with bcc's trace tool shows that the call ultimately lands in proc_pid_limits:
PID TID COMM FUNC
21450 21450 cat proc_pid_limits
proc_pid_limits+0x1
seq_read+0xe5
__vfs_read+0x1b
vfs_read+0x8e # the VFS read call chain
sys_read+0x55
do_syscall_64+0x73
entry_SYSCALL_64_after_hwframe+0x3d
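For reference, an invocation along the following lines produces the stack above (the tool path assumes the stock bcc installation location, which varies by distribution):
# attach a kprobe to proc_pid_limits and dump the kernel stack (-K) on each hit
/usr/share/bcc/tools/trace -K 'proc_pid_limits'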

As another example, listing /proc/13323/fd shows which file descriptors process 13323 currently has open:
[root@VM-X-X-tencentos proc]# ll -ath /proc/13323/fd
total 0
lrwx------ 1 root root 64 Feb 10 10:43 1 -> /dev/pts/1
lrwx------ 1 root root 64 Feb 10 10:43 2 -> /dev/pts/1
lr-x------ 1 root root 64 Feb 10 10:43 3 -> 'pipe:[2640532256]'
lrwx------ 1 root root 64 Feb 10 10:43 4 -> 'socket:[2640532254]'
l-wx------ 1 root root 64 Feb 10 10:43 5 -> 'pipe:[2640532256]'
lr-x------ 1 root root 64 Feb 10 10:43 6 -> /etc/sudoers
lrwx------ 1 root root 64 Feb 10 10:43 7 -> 'socket:[2640532270]'
dr-x------ 2 root root 8 Feb 10 10:43 .
lrwx------ 1 root root 64 Feb 10 10:43 0 -> /dev/pts/1
dr-xr-xr-x 9 root root 0 Feb 10 10:43 ..
The user-mode/kernel-mode transitions behind the command above are shown in the figure below:

procfs in detail
The contents of /proc fall mainly into the following categories:
- Statically generated entries (created at system initialization), such as /proc/fs, /proc/fb, /proc/net; these subdirectories are hooked under the proc_dir_entry tree of the /proc root during initialization
- The . and .. entries, which link to the current directory and the parent directory respectively
- The numeric per-process subdirectories under /proc/, which are generated dynamically on each read of proc by walking the current process list; the subdirectories and files inside each per-process directory are produced the same way
Main directories and files under /proc
TODO
0x02 procfs from the Kernel's Perspective
pseudo FS
Core data structures
- proc_dir_entry: describes an entry in the /proc directory tree
- pid_entry: describes the per-process entries under /proc, i.e. the contents of each per-PID directory
proc_dir_entry
proc_dir_entry is defined as follows:
struct proc_dir_entry {
	unsigned int low_ino;
	umode_t mode;
	nlink_t nlink;
	kuid_t uid;
	kgid_t gid;
	loff_t size;
	const struct inode_operations *proc_iops;	// inode operations for this entry
	const struct file_operations *proc_fops;	// file operations for this entry
	struct proc_dir_entry *parent;
	struct rb_root subdir;
	struct rb_node subdir_node;
	void *data;
	atomic_t count;		/* use count */
	atomic_t in_use;	/* number of callers into module in progress; */
				/* negative -> it's going away RSN */
	struct completion *pde_unload_completion;
	struct list_head pde_openers;	/* who did ->open, but not ->release */
	spinlock_t pde_unload_lock;	/* proc_fops checks and pde_users bumps */
	u8 namelen;
	char name[];
};
Note that in recent kernels subdir and subdir_node are implemented as a red-black tree; the 2.6 kernels used a linked list. A proc_dir_entry represents one proc entry inside the kernel, surfaced in procfs as a file or directory, which is why the struct carries file-like attribute members such as uid, gid, mode, and name.
What is a proc_dir_entry, essentially?
TODO
pid_entry
struct pid_entry {
	const char *name;
	unsigned int len;
	umode_t mode;
	const struct inode_operations *iop;
	const struct file_operations *fop;
	union proc_op op;
};
The operations of every file under a per-PID directory are defined in the tgid_base_stuff table:
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
REG("environ", S_IRUSR, proc_environ_operations),
REG("auxv", S_IRUSR, proc_auxv_operations),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
LNK("root", proc_root_link),
LNK("exe", proc_exe_link),
REG("mounts", S_IRUGO, proc_mounts_operations),
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
ONE("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
REG("latency", S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
ONE("cpuset", S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
#endif
#ifdef CONFIG_ELF_CORE
REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
ONE("io", S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_HARDWALL
ONE("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
REG("timers", S_IRUGO, proc_timers_operations),
#endif
REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
};
tgid_base_stuff is built from a handful of entry macros:
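Before going through them one by one, it helps to see their shape. The following paraphrases the macro definitions in fs/proc/base.c (a sketch, not a verbatim copy); each macro builds a pid_entry, differing only in the mode bits and in which operations it fills in:
#define NOD(NAME, MODE, IOP, FOP, OP) {		\
	.name = (NAME),				\
	.len  = sizeof(NAME) - 1,		\
	.mode = MODE,				\
	.iop  = IOP,				\
	.fop  = FOP,				\
	.op   = OP,				\
}
#define DIR(NAME, MODE, iops, fops)	\
	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {})
#define LNK(NAME, get_link)				\
	NOD(NAME, (S_IFLNK|S_IRWXUGO),			\
	    &proc_pid_link_inode_operations, NULL,	\
	    { .proc_get_link = get_link })
#define REG(NAME, MODE, fops)	\
	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
#define ONE(NAME, MODE, show)			\
	NOD(NAME, (S_IFREG|(MODE)),		\
	    NULL, &proc_single_file_operations,	\
	    { .proc_show = show })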
1. DIR: a directory entry. DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations) creates /proc/<pid>/task/, which holds information on all threads of the process; the arguments are the directory name, permissions, inode operations, and file operations
static const struct file_operations proc_task_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_task_readdir,	// directory iteration for /proc/<pid>/task
	.llseek		= generic_file_llseek,
};

static const struct inode_operations proc_task_inode_operations = {
	.lookup		= proc_task_lookup,
	.getattr	= proc_task_getattr,
	.setattr	= proc_setattr,
	.permission	= proc_pid_permission,
};
又如DIR("fd",S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),该目录表示某个进程打开的所有fd列表
[root@VM-X-X-tencentos ~]# ls /proc/1081/fd
0 1 2 3 4
const struct inode_operations proc_fd_inode_operations = {
	.lookup		= proc_lookupfd,	// the lookup implementation for this directory
	.permission	= proc_fd_permission,
	.setattr	= proc_setattr,
};

const struct file_operations proc_fd_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_readfd,
	.llseek		= generic_file_llseek,
};
2. REG: a regular-file entry. REG("environ", S_IRUSR, proc_environ_operations) creates a regular file such as /proc/<pid>/environ, which shows the process's environment variables; the arguments are the file name, permissions, and the file_operations structure
// https://elixir.bootlin.com/linux/v4.11.6/source/fs/proc/base.c#L974
static const struct file_operations proc_environ_operations = {
	.open		= environ_open,
	.read		= environ_read,
	.llseek		= generic_file_llseek,
	.release	= mem_release,
};
3. ONE: a single-show file entry. ONE("status", S_IRUGO, proc_pid_status) creates a read-only regular file driven by a simplified callback; /proc/<pid>/status, for instance, shows process state. The arguments are the file name, permissions, and the function that generates the content. The difference from REG is that ONE only supplies a simple show callback (wired up through a shared file_operations; see the glue sketch after the listing below), whereas REG must provide a complete file_operations structure
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
		    struct pid *pid, struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);

	task_name(m, task);
	task_state(m, ns, pid, task);

	if (mm) {
		task_mem(m, mm);
		mmput(mm);
	}
	task_sig(m, task);
	task_cap(m, task);
	task_seccomp(m, task);
	task_cpus_allowed(m, task);
	cpuset_task_status_allowed(m, task);
	task_context_switch_counts(m, task);
	return 0;
}
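How does a bare callback like proc_pid_status become a readable file? Entries declared with ONE share proc_single_file_operations, whose open handler routes reads through seq_file into the stored callback. A paraphrased sketch of that glue from fs/proc/base.c (treat as a sketch, not a verbatim copy):
static int proc_single_show(struct seq_file *m, void *v)
{
	struct inode *inode = m->private;
	struct pid_namespace *ns = inode->i_sb->s_fs_info;
	struct pid *pid = proc_pid(inode);
	struct task_struct *task;
	int ret;

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task)
		return -ESRCH;

	// dispatch to the per-entry callback stored by the ONE() macro
	ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);

	put_task_struct(task);
	return ret;
}

static int proc_single_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, proc_single_show, inode);
}

static const struct file_operations proc_single_file_operations = {
	.open		= proc_single_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};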
4. LNK: a symlink entry. LNK("cwd", proc_cwd_link) creates a symbolic link; /proc/<pid>/cwd, for example, points at the process's current working directory. The arguments are the link name and the function that resolves the link target
[root@VM-X-X-tencentos ~]# ll -alrth /proc/1081/
......
lrwxrwxrwx 1 root root 0 Nov 7 07:07 root -> /
lrwxrwxrwx 1 root root 0 Nov 7 07:07 cwd -> /
[root@VM-X-X-tencentos ~]# ls /proc/1081/cwd/
bin boot data dev etc home lib lib64 lost+found media mnt opt proc root run sbin srv sys tmp usr var
static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
	struct task_struct *task = get_proc_task(d_inode(dentry));
	int result = -ENOENT;

	if (task) {
		task_lock(task);
		if (task->fs) {
			get_fs_pwd(task->fs, path);	// copy the directory task->fs points at into *path
			result = 0;
		}
		task_unlock(task);
		put_task_struct(task);
	}
	return result;
}

static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
{
	spin_lock(&fs->lock);
	*pwd = fs->pwd;
	path_get(pwd);
	spin_unlock(&fs->lock);
}
The callbacks introduced above (struct file_operations / inode_operations) are invoked when the kernel's VFS layer operates on the corresponding struct file / inode: triggering an operation dispatches to the matching handler (open/read/llseek in file_operations, and so on), and any operation left undefined is simply not supported.
Summary
When a program reads a file under /proc, the kernel handles it as follows:

0x03 Analysis of proc_dir_entry
This section walks through the initialization of /proc and its subdirectories, following the kernel source.
1. During kernel initialization, the /proc directory is created:
asmlinkage void __init start_kernel(void)
{
	//...
	proc_root_init();
	//...
}
2. proc_root_init -> register_filesystem -> proc_sys_init
void __init proc_root_init(void)
{
	int err;

	proc_init_inodecache();
	set_proc_pid_nlink();
	err = register_filesystem(&proc_fs_type);	// register the proc file system
	if (err)
		return;

	proc_self_init();
	proc_thread_self_init();
	proc_symlink("mounts", NULL, "self/mounts");	// create the mounts symlink
	proc_net_init();	// create the net symlink and its internal directory tree
#ifdef CONFIG_SYSVIPC
	proc_mkdir("sysvipc", NULL);
#endif
	proc_mkdir("fs", NULL);		// create the fs directory
	proc_mkdir("driver", NULL);	// create the driver directory
	proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
	/* just give it a mountpoint */
	proc_create_mount_point("openprom");
#endif
	proc_tty_init();
	proc_mkdir("bus", NULL);
	proc_sys_init();	// create and initialize the sys directory
}

// register the proc file system type
// https://elixir.bootlin.com/linux/v4.11.6/source/fs/proc/root.c#L116
static struct file_system_type proc_fs_type = {
	.name		= "proc",
	.mount		= proc_mount,	// mount entry point
	.kill_sb	= proc_kill_sb,
	.fs_flags	= FS_USERNS_MOUNT,
};
3. The procfs mount entry point: proc_mount -> proc_fill_super. This mounts procfs into the kernel's global VFS tree and calls proc_fill_super to initialize the superblock:
// https://elixir.bootlin.com/linux/v4.11.6/source/fs/proc/root.c#L88
static struct dentry *proc_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	struct pid_namespace *ns;

	if (flags & MS_KERNMOUNT) {
		ns = data;
		data = NULL;
	} else {
		ns = task_active_pid_ns(current);
	}

	return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
}

int proc_fill_super(struct super_block *s, void *data, int silent)
{
	struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
	struct inode *root_inode;
	int ret;

	if (!proc_parse_options(data, ns))
		return -EINVAL;

	/* User space would break if executables or devices appear on proc */
	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
	s->s_blocksize = 1024;
	s->s_blocksize_bits = 10;	// must be 10: 2^10 = 1024
	s->s_magic = PROC_SUPER_MAGIC;	// magic number; the macro expands to 0x9fa0
	s->s_op = &proc_sops;		// the superblock operations, mostly inode handling
	s->s_time_gran = 1;
	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;

	pde_get(&proc_root);
	root_inode = proc_get_inode(s, &proc_root);	// turn the root proc_dir_entry into a VFS inode
	if (!root_inode) {
		pr_err("proc_fill_super: get root inode failed\n");
		return -ENOMEM;
	}

	// https://elixir.bootlin.com/linux/v4.11.6/source/fs/dcache.c#L1855
	// allocate the root dentry and attach it to the super_block
	s->s_root = d_make_root(root_inode);
	if (!s->s_root) {
		pr_err("proc_fill_super: allocate dentry failed\n");
		return -ENOMEM;
	}

	ret = proc_setup_self(s);
	if (ret) {
		return ret;
	}
	return proc_setup_thread_self(s);
}
enum {
	PROC_ROOT_INO = 1,
};

// https://elixir.bootlin.com/linux/v4.11.6/source/fs/proc/root.c#L204
struct proc_dir_entry proc_root = {
	.low_ino	= PROC_ROOT_INO,		// inode number of the root
	.namelen	= 5,				// length of the root's name
	.mode		= S_IFDIR | S_IRUGO | S_IXUGO,
	.nlink		= 2,
	.count		= ATOMIC_INIT(1),
	.proc_iops	= &proc_root_inode_operations,	// inode operations of the root
	.proc_fops	= &proc_root_operations,	// file operations of the root
	.parent		= &proc_root,
	.subdir		= RB_ROOT,
	.name		= "/proc",
};
The proc_root entry manages not only the regular files and directories but also the per-PID entries, so it must handle both inode and file operations. proc_root_inode_operations and proc_root_operations are defined as follows:
static const struct file_operations proc_root_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_root_readdir,	// directory iteration
	.llseek		= generic_file_llseek,
};

/*
 * proc root can do almost nothing..
 */
static const struct inode_operations proc_root_inode_operations = {
	.lookup		= proc_root_lookup,	// reached from the VFS real_lookup path
	.getattr	= proc_root_getattr,
};
proc_root_lookup: inode lookup under /proc
When user space accesses a proc file, the VFS calls real_lookup(), which dispatches to the .lookup member of the inode_operations, i.e. proc_root_lookup():
// https://elixir.bootlin.com/linux/v4.11.6/source/fs/proc/root.c#L204
static struct dentry *proc_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	// first, try the per-PID (process) lookup
	if (!proc_pid_lookup(dir, dentry, flags)) {
		return NULL;
	}
	// otherwise, fall back to the kernel-state files
	return proc_lookup(dir, dentry, flags);
}

// https://elixir.bootlin.com/linux/v4.11.6/source/fs/proc/generic.c#L251
struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
			   unsigned int flags)
{
	return proc_lookup_de(PDE(dir), dir, dentry);
}
The implementations of proc_pid_lookup and proc_lookup:
// look up a dentry among the per-PID directories
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	int result = -ENOENT;
	struct task_struct *task;
	unsigned tgid;
	struct pid_namespace *ns;

	// check that the name is numeric (fail fast otherwise)
	tgid = name_to_int(&dentry->d_name);
	if (tgid == ~0U)
		goto out;

	ns = dentry->d_sb->s_fs_info;
	rcu_read_lock();
	// resolve the pid to its task
	task = find_task_by_pid_ns(tgid, ns);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();
	if (!task)
		goto out;

	// instantiate a fresh inode and cache the dentry
	result = proc_pid_instantiate(dir, dentry, task, NULL);
	put_task_struct(task);
out:
	return ERR_PTR(result);
}
// proc_lookup -> proc_lookup_de
// https://elixir.bootlin.com/linux/v4.11.6/source/fs/proc/generic.c#L230
struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
			      struct dentry *dentry)
{
	struct inode *inode;

	read_lock(&proc_subdir_lock);
	// find the child proc_dir_entry with this name
	de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
	if (de) {
		// found it
		pde_get(de);
		read_unlock(&proc_subdir_lock);
		inode = proc_get_inode(dir->i_sb, de);	// obtain the corresponding inode
		if (!inode)
			return ERR_PTR(-ENOMEM);
		d_set_d_op(dentry, &simple_dentry_operations);	// the usual pattern: insert into the dentry cache
		d_add(dentry, inode);
		return NULL;
	}
	read_unlock(&proc_subdir_lock);
	return ERR_PTR(-ENOENT);
}
Why does proc_root_lookup try the PID lookup first and the kernel files second? Per-process directories far outnumber the kernel-state files, and process-oriented lookups (ps/top and the like) are much more frequent than reads of kernel-state files, so the hot path is matched first. A fast-fail check keeps this ordering cheap: if the name being looked up is not numeric (not a PID), proc_pid_lookup bails out immediately and the lookup falls through to the kernel files, as the sketch below shows.
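For reference, the numeric check is done by the name_to_int() helper; a paraphrased sketch (treat it as a sketch rather than a verbatim copy) that rejects leading zeros, non-digits, and overflow, returning ~0U for anything that is not a plain decimal PID:
static unsigned name_to_int(const struct qstr *qstr)
{
	const char *name = qstr->name;
	int len = qstr->len;
	unsigned n = 0;

	if (len > 1 && *name == '0')	// no leading zeros
		goto out;
	while (len-- > 0) {
		unsigned c = *name++ - '0';
		if (c > 9)			// not a digit
			goto out;
		if (n >= (~0U - 9) / 10)	// would overflow
			goto out;
		n *= 10;
		n += c;
	}
	return n;
out:
	return ~0U;
}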
file_operations: proc_root_readdir
The directory-iteration callback (.iterate_shared) of proc_root_operations is proc_root_readdir, the readdir implementation for the procfs root /proc/; it decides what gets listed under /proc:
#define FIRST_PROCESS_ENTRY 256

// https://elixir.bootlin.com/linux/v4.11.6/source/fs/proc/root.c#L170
static int proc_root_readdir(struct file *file, struct dir_context *ctx)
{
	// phase 1: emit the kernel-state (static) entries
	if (ctx->pos < FIRST_PROCESS_ENTRY) {
		int error = proc_readdir(file, ctx);	// list the kernel-related files
		if (unlikely(error <= 0))
			return error;			// error, or nothing left to read
		ctx->pos = FIRST_PROCESS_ENTRY;		// switch to the process-entry phase
	}

	// phase 2: emit the per-PID directories
	return proc_pid_readdir(file, ctx);
}
A directory can be listed in more than one way (ls and ls -al print different results); which entries get emitted is governed by the offset in file->f_pos carried across calls. For the /proc root directory (see the TGID_OFFSET note below):
- f_pos = 0: the . entry, a link to the directory itself
- f_pos = 1: the .. entry, a link to the parent directory
- f_pos in 2 .. FIRST_PROCESS_ENTRY-1: the static directories and files under /proc/
- f_pos in FIRST_PROCESS_ENTRY .. FIRST_PROCESS_ENTRY+ARRAY_SIZE(proc_base_stuff)-1: the self entries
- f_pos = FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff): init_task, i.e. initial process 0
- f_pos = PID_MAX_LIMIT + TGID_OFFSET: marks the end of directory iteration
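In this kernel generation the window for the two special symlinks is expressed through TGID_OFFSET; a paraphrased sketch of the constant from fs/proc/base.c:
/* the first two positions past FIRST_PROCESS_ENTRY are reserved for
 * "self" and "thread-self", which is why proc_pid_readdir below
 * special-cases TGID_OFFSET - 2 and TGID_OFFSET - 1 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)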
Continuing with proc_pid_readdir, which generates the process list when /proc is enumerated, covering:
- the special symlinks self and thread-self
- every process directory: /proc/1, /proc/2, and so on
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
	struct tgid_iter iter;
	// the pid namespace this procfs superblock belongs to
	struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
	loff_t pos = ctx->pos;

	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
		return 0;

	// emit the "self" symlink
	if (pos == TGID_OFFSET - 2) {
		struct inode *inode = d_inode(ns->proc_self);
		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
			return 0;
		ctx->pos = pos = pos + 1;
	}
	// emit the "thread-self" symlink
	if (pos == TGID_OFFSET - 1) {
		struct inode *inode = d_inode(ns->proc_thread_self);
		if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
			return 0;
		ctx->pos = pos = pos + 1;
	}

	iter.tgid = pos - TGID_OFFSET;
	iter.task = NULL;
	// next_tgid(ns, iter) walks every pid inside namespace ns
	for (iter = next_tgid(ns, iter);
	     iter.task;
	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
		char name[PROC_NUMBUF];
		int len;

		cond_resched();
		if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
			continue;

		len = snprintf(name, sizeof(name), "%d", iter.tgid);
		ctx->pos = iter.tgid + TGID_OFFSET;
		if (!proc_fill_cache(file, ctx, name, len,
				     proc_pid_instantiate, iter.task, NULL)) {
			put_task_struct(iter.task);
			return 0;
		}
	}
	// mark the directory iteration as finished
	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
	return 0;
}

static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
{
	struct pid *pid;

	if (iter.task)
		put_task_struct(iter.task);
	rcu_read_lock();
retry:
	iter.task = NULL;
	pid = find_ge_pid(iter.tgid, ns);	// first pid >= iter.tgid in this namespace
	if (pid) {
		iter.tgid = pid_nr_ns(pid, ns);
		iter.task = pid_task(pid, PIDTYPE_PID);
		// skip pids whose task is gone or is not a thread-group leader
		if (!iter.task || !has_group_leader_pid(iter.task)) {
			iter.tgid += 1;
			goto retry;
		}
		get_task_struct(iter.task);
	}
	rcu_read_unlock();
	return iter;
}
0x04 Analysis of pid_entry
0x05 proc Hook Examples: Process Attributes
The implementation of proc_pid_limits
/proc/<pid>/limits reflects the process's resource limits in real time:
[root@VM-X-X-centos ]# cat /proc/4124608/limits

/* Display limits for a process */
static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
			   struct pid *pid, struct task_struct *task)
{
	unsigned int i;
	unsigned long flags;
	struct rlimit rlim[RLIM_NLIMITS];

	if (!lock_task_sighand(task, &flags))
		return 0;
	// snapshot the limits while holding the sighand lock
	memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
	unlock_task_sighand(task, &flags);

	/*
	 * print the file header
	 */
	seq_printf(m, "%-25s %-20s %-20s %-10s\n",
		   "Limit", "Soft Limit", "Hard Limit", "Units");
	for (i = 0; i < RLIM_NLIMITS; i++) {
		if (rlim[i].rlim_cur == RLIM_INFINITY)
			seq_printf(m, "%-25s %-20s ",
				   lnames[i].name, "unlimited");
		else
			seq_printf(m, "%-25s %-20lu ",
				   lnames[i].name, rlim[i].rlim_cur);

		if (rlim[i].rlim_max == RLIM_INFINITY)
			seq_printf(m, "%-20s ", "unlimited");
		else
			seq_printf(m, "%-20lu ", rlim[i].rlim_max);

		if (lnames[i].unit)
			seq_printf(m, "%-10s\n", lnames[i].unit);
		else
			seq_putc(m, '\n');
	}
	return 0;
}
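The lnames table referenced above pairs each RLIMIT_* index with its label and unit; a partial, paraphrased sketch from fs/proc/base.c (remaining entries elided):
static const struct limit_names lnames[RLIM_NLIMITS] = {
	[RLIMIT_CPU]    = {"Max cpu time",      "seconds"},
	[RLIMIT_FSIZE]  = {"Max file size",     "bytes"},
	[RLIMIT_STACK]  = {"Max stack size",    "bytes"},
	[RLIMIT_NOFILE] = {"Max open files",    "files"},
	[RLIMIT_NICE]   = {"Max nice priority", NULL},
	/* ... one entry for each remaining RLIMIT_* ... */
};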
The implementation of proc_pid_cmdline
/proc/<pid>/cmdline records the full command line the process was started with (the arguments are NUL-separated, which is why they run together below):
[root@VM-X-X-centos X]# cat /proc/4124608/cmdline
sudosu-root
The VFS read path ends up in the function below, proc_pid_cmdline, which produces the contents of the cmdline file. (Strictly speaking this listing is the older, simpler variant of the code; in v4.11 the file is served by proc_pid_cmdline_read via proc_pid_cmdline_ops, as the REG("cmdline", ...) entry above shows, but the logic is easier to follow here.)
static int proc_pid_cmdline(struct task_struct *task, char *buffer)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);

	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	// length of the argument area
	len = mm->arg_end - mm->arg_start;
	if (len > PAGE_SIZE)
		len = PAGE_SIZE;

	// copy the argument area out of the target process's memory
	res = access_process_vm(task, mm->arg_start, buffer, len, 0);

	// If the nul at the end of args has been overwritten, then
	// assume application is using setproctitle(3); fall back to
	// scanning on into the environment area
	if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
		len = strnlen(buffer, res);
		if (len < res) {
			res = len;
		} else {
			len = mm->env_end - mm->env_start;
			if (len > PAGE_SIZE - res)
				len = PAGE_SIZE - res;
			// append and re-terminate to obtain the full cmdline
			res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}
0x06 proc Hook Examples: Process Memory
First, some background on process memory accounting. The memory a process uses falls mainly into the following categories (the first two count toward the process's RSS; the latter two belong to the page cache):
- Anonymous pages in user-mode address spaces: e.g. memory obtained via malloc, or mmap with MAP_ANONYMOUS; under memory pressure the kernel can swap these pages out
- Mapped pages in user-mode address spaces: covers both mapped files and mapped tmpfs; the former is e.g. an mmap of a specific file, the latter e.g. SysV IPC shared memory; the kernel can reclaim these pages under pressure, but may first have to sync data back to the file
- Pages in the page cache of disk files: created when a program reads or writes files with plain read/write; reclaimable under pressure, possibly after syncing data back to the file
- Buffer pages, also part of the page cache: e.g. from reading block-device files
This section surveys the memory-related proc files:
/proc/[pid]/stat and /proc/[pid]/statm
VSS/RSS/PSS/USS
- VSS (Virtual Set Size): virtual memory consumed, including the memory of shared libraries; this is the virtual memory the process has requested from the system, i.e. the entire address space the process can access, parts of which may not be resident in memory at all
- RSS (Resident Set Size): the memory the process actually holds in RAM, including the shared memory of shared libraries. This value is widely used for process memory monitoring, with one caveat: RSS counts the full footprint of every shared library, even though a shared library is loaded into memory only once no matter how many processes use it
- PSS (Proportional Set Size): the physical memory actually used, with shared-library memory apportioned. Compared with RSS, shared-library pages in PSS are counted proportionally (if N processes use a library, each is charged 1/N of its pages), so PSS states unambiguously what share of total system memory a single process really uses
- USS (Unique Set Size): the physical memory owned exclusively by the process, excluding shared libraries entirely. USS reveals the true incremental memory cost of running the process: if the process terminates, USS is the amount actually returned to the system
In general: VSS >= RSS >= PSS >= USS. PSS and USS do not appear in stat/statm but can be derived from /proc/<pid>/smaps, as the one-liner below shows.
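Assuming a kernel built with CONFIG_PROC_PAGE_MONITOR (so that smaps exposes a per-mapping Pss field), summing those fields yields the process's PSS; the PID 1234 below is a placeholder:
# total PSS of PID 1234, in kB
awk '/^Pss:/ { total += $2 } END { print total " kB" }' /proc/1234/smaps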
The implementation of do_task_stat: /proc/[pid]/stat
The relevant fields in the /proc/[pid]/stat output:
(23) vsize %lu
Virtual memory size in bytes.
(24) rss %ld
Resident Set Size: number of pages the process has
in real memory. This is just the pages which count
toward text, data, or stack space. This does not
include pages which have not been demand-loaded in,
or which are swapped out.
The kernel function behind this file is do_task_stat, and the rss column is computed by get_mm_rss. As the code shows, RSS is the sum of MM_FILEPAGES, MM_ANONPAGES and MM_SHMEMPAGES (unit: pages):
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
	return get_mm_counter(mm, MM_FILEPAGES) +
	       get_mm_counter(mm, MM_ANONPAGES) +
	       get_mm_counter(mm, MM_SHMEMPAGES);
}
Note that the RSS value in /proc/[pid]/stat equals the RSS column of /proc/[pid]/statm:
[root@VM-x-x-centos memory]# cat /proc/3447782/stat | awk '{print "RSS(page):", $24}'
RSS(page): 22811
[root@VM-218-158-centos memory]# cat /proc/3447782/statm
330060 22811 3727 1320 0 33217 0
The implementation of proc_pid_statm
/proc/${PID}/statm records the memory usage of the given process, with every value expressed in pages (typically 4 KB each). The backing kernel function is proc_pid_statm: when user space reads /proc/${PID}/statm, the VFS dispatches to it.
Provides information about memory usage, measured in pages.
The columns are:
size (1) total program size
(same as VmSize in /proc/[pid]/status)
resident (2) resident set size
(same as VmRSS in /proc/[pid]/status)
share (3) shared pages (i.e., backed by a file)
text (4) text (code)
lib (5) library (unused in Linux 2.6)
data (6) data + stack
dt (7) dirty pages (unused in Linux 2.6)
[root@VM-X-X-centos ~]# cat /proc/3447782/statm
329932 21419 2538 1320 0 32065 0
# size     (329932): total virtual memory of the process (including swapped-out and unmapped parts)
# resident (21419):  physical memory currently resident, i.e. the RSS
# share    (2538):   physical memory shared with other processes (shared libraries, shared memory segments)
# text     (1320):   physical memory of the code segment (processes running the same program share it, so it may be counted repeatedly across processes)
# data     (32065):  data segment (globals, static data) plus stack, reflecting heap and stack usage
proc_pid_statm is implemented as follows; note that get_mm_counter simply reads the per-type counter mm->rss_stat.count[member] from the mm_struct:
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
	long val = atomic_long_read(&mm->rss_stat.count[member]);

#ifdef SPLIT_RSS_COUNTING
	/*
	 * counter is updated in asynchronous manner and may go to minus.
	 * But it's never be expected number for users.
	 */
	if (val < 0)
		val = 0;
#endif
	return (unsigned long)val;
}

unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES) +
		  get_mm_counter(mm, MM_SHMEMPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
		>> PAGE_SHIFT;
	*data = mm->data_vm + mm->stack_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}

int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
		   struct pid *pid, struct task_struct *task)
{
	unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
	struct mm_struct *mm = get_task_mm(task);

	if (mm) {
		size = task_statm(mm, &shared, &text, &data, &resident);
		mmput(mm);
	}
	/*
	 * For quick read, open code by putting numbers directly
	 * expected format is
	 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
	 *            size, resident, shared, text, data);
	 */
	seq_put_decimal_ull(m, "", size);
	seq_put_decimal_ull(m, " ", resident);
	seq_put_decimal_ull(m, " ", shared);
	seq_put_decimal_ull(m, " ", text);
	seq_put_decimal_ull(m, " ", 0);	// lib column: unused
	seq_put_decimal_ull(m, " ", data);
	seq_put_decimal_ull(m, " ", 0);	// dt column: unused
	seq_putc(m, '\n');
	return 0;
}
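To close the loop from the user side, here is a minimal hypothetical program that reads /proc/self/statm and converts the page counts to kilobytes with sysconf(_SC_PAGESIZE):
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long size, resident, share, text, lib, data, dt;
	FILE *f = fopen("/proc/self/statm", "r");

	if (!f)
		return 1;
	// the seven columns written by proc_pid_statm above
	if (fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
		   &size, &resident, &share, &text, &lib, &data, &dt) != 7) {
		fclose(f);
		return 1;
	}
	fclose(f);

	long page_kb = sysconf(_SC_PAGESIZE) / 1024;
	printf("VmSize: %lu kB, VmRSS: %lu kB, shared: %lu kB\n",
	       size * page_kb, resident * page_kb, share * page_kb);
	return 0;
}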