Skip to the content.

sysfs sched

分析下 linux/kernel/sched/debug.c 中的内容

/proc/schedstat && /proc/pid/schedstat

实现文件 : kernel/sched/stats.c ,一共也就 200 行

/proc/pid/schedstat

/proc/pid/schedstat 的含义:

time spent on the cpu (in nanoseconds)
time spent waiting on a runqueue (in nanoseconds)
# of timeslices run on this cpu
struct sched_info {
#ifdef CONFIG_SCHED_INFO
	/* Cumulative counters: */

	/* # of times we have run on this CPU: */
	unsigned long			pcount;

	/* Time spent waiting on a runqueue: */
	unsigned long long		run_delay;

	/* Timestamps: */

	/* When did we last run on a CPU? */
	unsigned long long		last_arrival;

	/* When were we last queued to run? */
	unsigned long long		last_queued;

#endif /* CONFIG_SCHED_INFO */
};

[ ] 密切关联的

config TASK_DELAY_ACCT
	bool "Enable per-task delay accounting"
	depends on TASKSTATS
	select SCHED_INFO
	help
	  Collect information on time spent by a task waiting for system
	  resources like cpu, synchronous block I/O completion and swapping
	  in pages. Such statistics can help in setting a task's priorities
	  relative to other tasks for cpu, io, rss limits etc.

	  Say N if unsure.

[ ] 回答这个问题

https://stackoverflow.com/questions/52736873/understanding-scheduling-and-proc-pid-sched

/proc/pid/sched

fs/proc/base.c 中 sched_show -> proc_sched_show_task 关联的 config 为 CONFIG_SCHED_DEBUG 。其中的内容是受到 CONFIG_SCHEDSTATS 影响的。

实现的函数 proc_sched_show_task 在 kernel/sched/debug.c 中,输出示例:

qemu (11176, #threads: 124)
-------------------------------------------------------------------
se.exec_start                                :     140935799.916922
se.vruntime                                  :         10268.049254
se.sum_exec_runtime                          :         43129.162737
se.nr_migrations                             :                 3645
nr_switches                                  :               218495
nr_voluntary_switches                        :               218380
nr_involuntary_switches                      :                  115
se.load.weight                               :              1048576
se.runnable_weight                           :              1048576
se.avg.load_sum                              :                   19
se.avg.runnable_load_sum                     :                   19
se.avg.util_sum                              :                19456
se.avg.load_avg                              :                    0
se.avg.runnable_load_avg                     :                    0
se.avg.util_avg                              :                    0
se.avg.last_update_time                      :      140935799916544
se.avg.util_est.ewma                         :                   10
se.avg.util_est.enqueued                     :                    0
policy                                       :                    0
prio                                         :                  120
clock-delta                                  :                   30
mm->numa_scan_seq                            :                   61
numa_pages_migrated                          :                  724
numa_preferred_nid                           :                    5
total_numa_faults                            :                 3440
current_node=5, numa_group_id=11176
numa_faults node=0 task_private=1 task_shared=1 group_private=1 group_shared=1
numa_faults node=1 task_private=1 task_shared=1 group_private=1 group_shared=1
numa_faults node=2 task_private=1 task_shared=1 group_private=1 group_shared=1
numa_faults node=3 task_private=1 task_shared=0 group_private=1 group_shared=0
numa_faults node=4 task_private=1 task_shared=1 group_private=1 group_shared=1
numa_faults node=5 task_private=3427 task_shared=1 group_private=3427 group_shared=1
numa_faults node=6 task_private=1 task_shared=1 group_private=1 group_shared=1
numa_faults node=7 task_private=1 task_shared=0 group_private=1 group_shared=0

/proc/pid/status

实现函数 : proc_pid_status

voluntary_ctxt_switches:        0
nonvoluntary_ctxt_switches:     0
x86_Thread_features:
x86_Thread_features_locked:

/proc/stat

文档: https://man7.org/linux/man-pages/man5/proc_stat.5.html

可以从 fs/proc/stat.c:show_stat 入手,确定各类时间统计的起止位置:

		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
		user		= cpustat[CPUTIME_USER];
		nice		= cpustat[CPUTIME_NICE];
		system		= cpustat[CPUTIME_SYSTEM];
		idle		= get_idle_time(&kcpustat, i);
		iowait		= get_iowait_time(&kcpustat, i);
		irq		= cpustat[CPUTIME_IRQ];
		softirq		= cpustat[CPUTIME_SOFTIRQ];
		steal		= cpustat[CPUTIME_STEAL];
		guest		= cpustat[CPUTIME_GUEST];
		guest_nice	= cpustat[CPUTIME_GUEST_NICE];

通过 /proc/stat 来实现 top 1 ,展示各个 CPU 在各种任务上花费了多少时间。

debugfs

/sys/kernel/debug/sched

debug

展示每一个 cfs_rq 的信息,也就是 cgroup 层级 * cpu 数量

cfs_rq[31]:/test
  .exec_clock                    : 0.000000
  .MIN_vruntime                  : 0.000001
  .min_vruntime                  : 69643.566731
  .max_vruntime                  : 0.000001
  .spread                        : 0.000000
  .spread0                       : -854710191.753634
  .nr_spread_over                : 0
  .nr_running                    : 1
  .h_nr_running                  : 1
  .idle_nr_running               : 0
  .idle_h_nr_running             : 0
  .load                          : 1048576
  .load_avg                      : 1024
  .runnable_avg                  : 1024
  .util_avg                      : 1019
  .util_est_enqueued             : 0
  .removed.load_avg              : 0
  .removed.util_avg              : 0
  .removed.runnable_avg          : 0
  .tg_load_avg_contrib           : 1024
  .tg_load_avg                   : 32763
  .throttled                     : 0
  .throttle_count                : 0
  .se->exec_start                : 1083661332.978182
  .se->vruntime                  : 1142880506.076297
  .se->sum_exec_runtime          : 69682.607166
  .se->load.weight               : 32773
  .se->avg.load_avg              : 32
  .se->avg.util_avg              : 1019
  .se->avg.runnable_avg          : 1024

features

GENTLE_FAIR_SLEEPERS START_DEBIT NO_NEXT_BUDDY LAST_BUDDY CACHE_HOT_BUDDY WAKEUP_PREEMPTION NO_HRTICK NO_HRTICK_DL NO_DOUBLE_TICK NONTASK_CAPACITY TTWU_QUEUE NO_SIS_PROP SIS_UTIL NO_WARN_DOUBLE_CLOCK RT_PUSH_IPI NO_RT_RUNTIME_SHARE NO_LB_MIN ATTACH_AGE_LOAD WA_IDLE WA_WEIGHT WA_BIAS UTIL_EST UTIL_EST_FASTUP NO_LATENCY_WARN ALT_PERIOD BASE_SLICE

idle_min_granularity_ns

750000

latency_ns

latency_warn_ms

latency_warn_once

migration_cost_ns

min_granularity_ns

nr_migrate

preempt

tunable_scaling

verbose

wakeup_granularity_ns

echo w | sudo tee /proc/sysrq-trigger

/proc 下的目录和 /proc/pid/task 下的目录都是动态生成的

很显然,就是 thread group 组成 /proc :

/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)

忽然意识到,这个目录是查询的过程中动态展示出来的,参考

proc_pid_make_inode

#0  proc_pid_make_inode (sb=0xffff888103c28000, task=task@entry=0xffff8881109b8000, mode=33188) at fs/proc/base.c:1890
#1  0xffffffff814c8f15 in proc_pident_instantiate (dentry=dentry@entry=0xffff88811da25680, task=task@entry=0xffff8881109b8000, ptr=ptr@entry=0xffffffff8264c0e8 <tgid_base_stuff+520>) at fs/proc/base.c:2643
#2  0xffffffff814c908e in proc_pident_lookup (end=0xffffffff8264c6b0, p=0xffffffff8264c0e8 <tgid_base_stuff+520>, dentry=0xffff88811da25680, dir=<optimized out>) at fs/proc/base.c:2679
#3  proc_tgid_base_lookup (dir=<optimized out>, dentry=0xffff88811da25680, flags=<optimized out>) at fs/proc/base.c:3378
#4  0xffffffff8143f15a in lookup_open (op=0xffffc90000017edc, op=0xffffc90000017edc, got_write=false, file=0xffff888106eceb00, nd=0xffffc90000017dc0) at fs/namei.c:3394
#5  open_last_lookups (op=0xffffc90000017edc, file=0xffff888106eceb00, nd=0xffffc90000017dc0) at fs/namei.c:3484
#6  path_openat (nd=nd@entry=0xffffc90000017dc0, op=op@entry=0xffffc90000017edc, flags=flags@entry=65) at fs/namei.c:3712
#7  0xffffffff81440d06 in do_filp_open (dfd=dfd@entry=-100, pathname=pathname@entry=0xffff888110a72000, op=op@entry=0xffffc90000017edc) at fs/namei.c:3742
#8  0xffffffff81427aea in do_sys_openat2 (dfd=-100, filename=<optimized out>, how=how@entry=0xffffc90000017f18) at fs/open.c:1356
#9  0xffffffff81427fe7 in do_sys_open (mode=<optimized out>, flags=<optimized out>, filename=<optimized out>, dfd=<optimized out>) at fs/open.c:1372
#10 __do_sys_openat (mode=<optimized out>, flags=<optimized out>, filename=<optimized out>, dfd=<optimized out>) at fs/open.c:1388
#11 __se_sys_openat (mode=<optimized out>, flags=<optimized out>, filename=<optimized out>, dfd=<optimized out>) at fs/open.c:1383
#12 __x64_sys_openat (regs=<optimized out>) at fs/open.c:1383
#13 0xffffffff822a0f0c in do_syscall_x64 (nr=<optimized out>, regs=0xffffc90000017f58) at arch/x86/entry/common.c:50
#14 do_syscall_64 (regs=0xffffc90000017f58, nr=<optimized out>) at arch/x86/entry/common.c:80

/proc/pid

主要看看其中的软链接是如何实现的?

/proc/pid/stat

获取一个 process 上次运行所在的 CPU :

awk '{print "PID: " $1, "Last CPU: " $39}' /proc/self/stat

/proc/pid/cmdline

proc_pid_cmdline_ops

实现得非常复杂,不是一行的事

/proc/pid/cwd

当前工作目录通过 task_struct 的 fs 字段获取:

struct task_struct {
  // ...
	/* Filesystem information: */
	struct fs_struct		*fs;
  // ...

如果一个进程的 pwd 被删除了,在 /proc/10204/cwd 将是:

cwd -> '/root/gg (deleted)'

如果重新创建 /root/gg ,输出还是如此。

这个现象可以继续跟踪一下。

/proc/sys/kernel 和 sched 相关控制项目

https://docs.kernel.org/admin-guide/sysctl/kernel.html

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。