- 难道只有 fair 调度类才可以被 cgroup 管理?
- 似乎也有道理
- sched_class::task_change_group 注册为 task_change_group_fair
- detach_task_cfs_rq
- detach_entity_cfs_rq
- attach_task_cfs_rq
- detach_task_cfs_rq
@[
task_change_group_fair+5
sched_move_task+151
cpu_cgroup_attach+54
cgroup_migrate_execute+909
cgroup_attach_task+326
__cgroup_procs_write+244
cgroup_procs_write+23
kernfs_fop_write_iter+300
vfs_write+706
ksys_write+99
do_syscall_64+60
entry_SYSCALL_64_after_hwframe+114
]: 1
@[
task_change_group_fair+5
sched_move_task+151
autogroup_move_group+147
sched_autogroup_create_attach+169
ksys_setsid+234
__x64_sys_setsid+14
do_syscall_64+60
entry_SYSCALL_64_after_hwframe+114
]: 1
@[
task_change_group_fair+5
sched_move_task+151
do_exit+853
__x64_sys_exit+27
do_syscall_64+60
entry_SYSCALL_64_after_hwframe+114
]: 116
- autogroup_move_group : autogroup 机制
- cpu_cgroup_attach :
- sched_move_task
- sched_change_group
- sched_move_task
struct sched_entity {
#ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
}
struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
struct list_head leaf_cfs_rq_list;
struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
}
CONFIG_CFS_BANDWIDTH
参考资料:
- https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt
CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the specification of the maximum CPU bandwidth available to a group or hierarchy. The bandwidth allowed for a group is specified using a quota and period. Within each given “period” (microseconds), a group is allowed to consume only up to “quota” microseconds of CPU time. When the CPU bandwidth consumption of a group exceeds this limit (for that period), the tasks belonging to its hierarchy will be throttled and are not allowed to run again until the next period.
总结到位。
#0 walk_tg_tree_from (from=0xffff888105cf9500, down=down@entry=0xffffffff8118f170 <tg_throttle_down>, up=0xffffffff81187820 <tg_nop>, data=data@entry=0xffff8883339ee3c0) at kernel/sched/core.c:1247
#1 0xffffffff81193b00 in throttle_cfs_rq (cfs_rq=0xffff888114767e00) at kernel/sched/fair.c:5432
#2 0xffffffff8119d2ed in pick_next_task_fair (rq=rq@entry=0xffff8883339ee3c0, prev=prev@entry=0xffff888108634600, rf=rf@entry=0xffffc90002577eb0) at kernel/sched/fair.c:8030
#3 0xffffffff822b718b in __pick_next_task (rf=0xffffc90002577eb0, prev=0xffff888108634600, rq=0xffff8883339ee3c0) at kernel/sched/core.c:5972
#4 pick_next_task (rf=0xffffc90002577eb0, prev=0xffff888108634600, rq=0xffff8883339ee3c0) at kernel/sched/core.c:6047
#5 __schedule (sched_mode=sched_mode@entry=0) at kernel/sched/core.c:6633
#6 0xffffffff822b82f2 in schedule () at kernel/sched/core.c:6745
#7 0xffffffff811f7abd in exit_to_user_mode_loop (ti_work=8, regs=<optimized out>) at kernel/entry/common.c:159
#8 exit_to_user_mode_prepare (regs=0xffffc90002577f58) at kernel/entry/common.c:204
#9 0xffffffff822aafd9 in irqentry_exit_to_user_mode (regs=<optimized out>) at kernel/entry/common.c:310
#10 0xffffffff8240148a in asm_sysvec_apic_timer_interrupt () at ./arch/x86/include/asm/idtentry.h:645
依赖关系是什么
if CGROUP_SCHED
config FAIR_GROUP_SCHED
bool "Group scheduling for SCHED_OTHER"
depends on CGROUP_SCHED
default CGROUP_SCHED
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
help
This option optimizes the scheduler for common desktop workloads by
automatically creating and populating task groups. This separation
of workloads isolates aggressive CPU burners (like build jobs) from
desktop applications. Task group autogeneration is currently based
upon task session.
- CGROUP_SCHED 下才有 FAIR_GROUP_SCHED,之后才有 SCHED_AUTOGROUP
- 但是很多时候都会选择 SCHED_AUTOGROUP,而它会 select CGROUPS、CGROUP_SCHED 和 FAIR_GROUP_SCHED,于是这几个选项就会自动打开
所以看上去存在一堆 CONFIG,实际上区分它们没有太大意义,默认都是打开的。
struct task_group、CONFIG_FAIR_GROUP_SCHED 以及 CONFIG_CFS_BANDWIDTH 三者是逐层递进的关系
struct task_group
每一个 css(struct cgroup_subsys_state)都会被嵌入到对应子系统自己的结构体中
这是 cpuset 的:
struct cpuset {
struct cgroup_subsys_state css;
/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each CPU */
struct sched_entity **se;
/* runqueue "owned" by this group on each CPU */
struct cfs_rq **cfs_rq;
unsigned long shares;
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
* it in its own cacheline separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
struct rt_bandwidth rt_bandwidth;
#endif
struct rcu_head rcu;
struct list_head list;
struct task_group *parent;
struct list_head siblings;
struct list_head children;
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
struct cfs_bandwidth cfs_bandwidth;
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* The two decimal precision [%] value requested from user-space */
unsigned int uclamp_pct[UCLAMP_CNT];
/* Clamp values requested for a task group */
struct uclamp_se uclamp_req[UCLAMP_CNT];
/* Effective clamp values used for a task group */
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
这里包含的内容,完全就是没搞清楚的那些内容。
- task_group 会为每个 CPU 再维护一个 cfs_rq,这个 cfs_rq 用于组织挂在这个任务组上的任务以及子任务组,参考图中的 Group A;
- 调度器在调度的时候,比如调用 pick_next_task_fair 时,会遍历队列,选择 sched_entity,如果发现 sched_entity 对应的是 task_group,则会继续往下选择;
- 由于 sched_entity 结构中存在 parent 指针,指向它的父结构,因此,系统的运行也能从下而上的进行遍历操作,通常使用函数 walk_tg_tree_from 进行遍历;
创建 task_group 存在两种原因:
- ssh 登录(login)时,因为 autogroup 机制,也会创建出来一个 group,调用栈如下:
#0 alloc_fair_sched_group (tg=tg@entry=0xffff88810c7a0600, parent=parent@entry=0xffffffff83e9b340 <root_task_group>) at kernel/sched/fair.c:12362
#1 0xffffffff8118ae57 in sched_create_group (parent=0xffffffff83e9b340 <root_task_group>) at kernel/sched/core.c:10358
#2 0xffffffff811af83f in autogroup_create () at kernel/sched/build_utility.c:11034
#3 sched_autogroup_create_attach (p=0xffff888108635780) at kernel/sched/build_utility.c:11136
#4 0xffffffff8115f98a in ksys_setsid () at kernel/sys.c:1247
#5 0xffffffff8115f9ce in __do_sys_setsid (__unused=<optimized out>) at kernel/sys.c:1254
#6 0xffffffff822a5e5c in do_syscall_x64 (nr=<optimized out>, regs=0xffffc90001867f58) at arch/x86/entry/common.c:50
#7 do_syscall_64 (regs=0xffffc90001867f58, nr=<optimized out>) at arch/x86/entry/common.c:80
#8 0xffffffff824000ae in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120
#9 0x0000000000000000 in ?? ()
- 因为构建 cgroup 而创建的:
#0 alloc_fair_sched_group (tg=tg@entry=0xffff88810ff39500, parent=parent@entry=0xffffffff83e9b340 <root_task_group>) at kernel/sched/fair.c:12362
#1 0xffffffff8118ae57 in sched_create_group (parent=0xffffffff83e9b340 <root_task_group>) at kernel/sched/core.c:10358
#2 0xffffffff8118af13 in cpu_cgroup_css_alloc (parent_css=<optimized out>) at kernel/sched/core.c:10523
#3 0xffffffff8123234b in css_create (ss=0xffffffff8306f840 <cpu_cgrp_subsys>, cgrp=0xffff8881139b8800) at kernel/cgroup/cgroup.c:5541
#4 cgroup_apply_control_enable (cgrp=cgrp@entry=0xffffffff83168350 <cgrp_dfl_root+16>) at kernel/cgroup/cgroup.c:3249
#5 0xffffffff81233d4b in cgroup_apply_control (cgrp=0xffffffff83168350 <cgrp_dfl_root+16>) at kernel/cgroup/cgroup.c:3331
#6 cgroup_subtree_control_write (of=0xffff888105e40d80, buf=<optimized out>, nbytes=4, off=<optimized out>) at kernel/cgroup/cgroup.c:3485
#7 0xffffffff814dec69 in kernfs_fop_write_iter (iocb=0xffffc9000185fea0, iter=<optimized out>) at fs/kernfs/file.c:334
#8 0xffffffff8142da2f in call_write_iter (iter=0xffffffff83e9b340 <root_task_group>, kio=0xffff88810ff39500, file=0xffff88810659cd00) at ./include/linux/fs.h:1868
#9 new_sync_write (ppos=0xffffc9000185ff08, len=4, buf=0x1b86d20 "+cpu", filp=0xffff88810659cd00) at fs/read_write.c:491
#10 vfs_write (file=file@entry=0xffff88810659cd00, buf=buf@entry=0x1b86d20 "+cpu", count=count@entry=4, pos=pos@entry=0xffffc9000185ff08) at fs/read_write.c:584
#11 0xffffffff8142dea3 in ksys_write (fd=<optimized out>, buf=0x1b86d20 "+cpu", count=4) at fs/read_write.c:637
#12 0xffffffff822a5e5c in do_syscall_x64 (nr=<optimized out>, regs=0xffffc9000185ff58) at arch/x86/entry/common.c:50
#13 do_syscall_64 (regs=0xffffc9000185ff58, nr=<optimized out>) at arch/x86/entry/common.c:80
#14 0xffffffff824000ae in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120
根据 my_q 指针来区分一个 sched_entity 到底指的是一个 task,还是一个 task group:
#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
#else
#define entity_is_task(se) 1
#endif
本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。