Skip to the content.

https://github.com/sched-ext/scx

先试试这个

https://github.com/parttimenerd/minimal-scheduler

[ 9178.179103] sched_ext: "minimal_scheduler" does not implement cgroup cpu.weight
[ 9178.180274] sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()
[ 9178.180275] sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()
[ 9178.180275] sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()
[ 9178.180275] sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()
[ 9178.180275] sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()
[ 9178.180275] sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()
[ 9178.180275] sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()
[ 9178.180275] sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()
[ 9178.180450] sched_ext: BPF scheduler "minimal_scheduler" enabled

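似乎有两个警告:一是 "minimal_scheduler" 没有实现 cgroup cpu.weight,二是 scx_bpf_consume() 已更名为 scx_bpf_dsq_move_to_local()。需要把这个代码更新一下: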

是不是使用了

测试

的确,在运行 start.sh 之后,可以观察到大量如下的调用栈:

@[
    scx_pick_idle_cpu+5
    scx_select_cpu_dfl+445
    select_task_rq_scx+300
    try_to_wake_up+445
    wake_up_q+78
    futex_wake+345
    do_futex+293
    __x64_sys_futex+297
    do_syscall_64+188
    entry_SYSCALL_64_after_hwframe+119
]: 42973
@[
    scx_select_cpu_dfl+5
    select_task_rq_scx+300
    try_to_wake_up+445
    wake_up_q+78
    futex_wake+345
    do_futex+293
    __x64_sys_futex+297
    do_syscall_64+188
    entry_SYSCALL_64_after_hwframe+119
]: 1618732
@[
    scx_bpf_dsq_move_to_local+9
    bpf_prog_3768393fc527e957_sched_dispatch+27
    bpf__sched_ext_ops_dispatch+75
    balance_one+334
    balance_scx+53
    prev_balance+67
    __pick_next_task+107
    __schedule+362
    schedule+65
    futex_wait_queue+101
    __futex_wait+334
    futex_wait+121
    do_futex+203
    __x64_sys_futex+297
    do_syscall_64+188
    entry_SYSCALL_64_after_hwframe+119
]: 1618967

关闭之后,就只能看到下面这个调用栈:

@[
    scx_tick+9
    sched_tick+221
    update_process_times+150
    tick_nohz_handler+143
    __hrtimer_run_queues+133
    hrtimer_interrupt+255
    __sysvec_apic_timer_interrupt+82
    sysvec_apic_timer_interrupt+110
    asm_sysvec_apic_timer_interrupt+26
    default_idle+15
    default_idle_call+63
    do_idle+463
    cpu_startup_entry+41
    start_secondary+286
    common_startup_64+318
]: 7932

具体代码

难道真的是这个影响的吗?

struct task_struct {
  // ...
	const struct sched_class	*sched_class;

似乎是的,例如看这个例子:

例如 enqueue_task 中

void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	p->sched_class->enqueue_task(rq, p, flags);
	/*
	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
	 * ->sched_delayed.
	 */
	uclamp_rq_inc(rq, p);

	psi_enqueue(p, flags);

	if (!(flags & ENQUEUE_RESTORE))
		sched_info_enqueue(rq, p);

	if (sched_core_enabled(rq))
		sched_core_enqueue(rq, p);
}

任务被 wake up 的时候,总是需要先加入到运行队列中;而在睡眠的时候(例如下面的 futex_wait 路径),则在 __schedule 中从队列中离开:

@[
    dequeue_entity+1
    dequeue_entities+289
    dequeue_task_fair+151
    __schedule+1940
    schedule+65
    futex_wait_queue+101
    __futex_wait+334
    futex_wait+121
    do_futex+203
    __x64_sys_futex+297
    do_syscall_64+188
    entry_SYSCALL_64_after_hwframe+119
]: 279

唤醒时,则是在中断(IPI,sysvec_call_function_single)处理过程中通过 sched_ttwu_pending 把任务加入到队列中:

@[
    enqueue_task_fair+5
    enqueue_task+55
    ttwu_do_activate+111
    sched_ttwu_pending+245
    __flush_smp_call_function_queue+320
    __sysvec_call_function_single+28
    sysvec_call_function_single+110
    asm_sysvec_call_function_single+26
    default_idle+15
    default_idle_call+63
    do_idle+463
    cpu_startup_entry+41
    start_secondary+286
    common_startup_64+318
]: 1878

看看 sysfs 的结果

/sys/kernel/sched_ext🔒 🐕
🧀  tree
.
├── enable_seq
├── hotplug_seq
├── nr_rejected
├── root
│   └── ops
├── state
└── switch_all

2 directories, 6 files

tools/sched_ext/ 是做什么的?

kernel 源码中的 tools/sched_ext/ 是做什么的?

当时使用了 ebpf 吗?

https://github.com/google/ghost-userspace

SCHED_EXT

https://mp.weixin.qq.com/s/89PuLJDE4aE1c3cWG6ZL8g

获取一点代码的基本感觉

基本的感觉是在这里的:

DEFINE_SCHED_CLASS(ext):sched_class 的各个回调都有对应的 _scx 实现

DEFINE_SCHED_CLASS(ext) = {
	.enqueue_task		= enqueue_task_scx,
	.dequeue_task		= dequeue_task_scx,
	.yield_task		= yield_task_scx,
	.yield_to_task		= yield_to_task_scx,

	.wakeup_preempt		= wakeup_preempt_scx,

	.balance		= balance_scx,
	.pick_task		= pick_task_scx,

	.put_prev_task		= put_prev_task_scx,
	.set_next_task		= set_next_task_scx,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_scx,
	.task_woken		= task_woken_scx,
	.set_cpus_allowed	= set_cpus_allowed_scx,

	.rq_online		= rq_online_scx,
	.rq_offline		= rq_offline_scx,
#endif

	.task_tick		= task_tick_scx,

	.switching_to		= switching_to_scx,
	.switched_from		= switched_from_scx,
	.switched_to		= switched_to_scx,
	.reweight_task		= reweight_task_scx,
	.prio_changed		= prio_changed_scx,

	.update_curr		= update_curr_scx,

#ifdef CONFIG_UCLAMP_TASK
	.uclamp_enabled		= 1,
#endif
};

sched_ext_ops:这是面向用户提供的 (BPF) scheduler 的回调接口

static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
	.select_cpu		= sched_ext_ops__select_cpu,
	.enqueue		= sched_ext_ops__enqueue,
	.dequeue		= sched_ext_ops__dequeue,
	.dispatch		= sched_ext_ops__dispatch,
	.tick			= sched_ext_ops__tick,
	.runnable		= sched_ext_ops__runnable,
	.running		= sched_ext_ops__running,
	.stopping		= sched_ext_ops__stopping,
	.quiescent		= sched_ext_ops__quiescent,
	.yield			= sched_ext_ops__yield,
	.core_sched_before	= sched_ext_ops__core_sched_before,
	.set_weight		= sched_ext_ops__set_weight,
	.set_cpumask		= sched_ext_ops__set_cpumask,
	.update_idle		= sched_ext_ops__update_idle,
	.cpu_acquire		= sched_ext_ops__cpu_acquire,
	.cpu_release		= sched_ext_ops__cpu_release,
	.init_task		= sched_ext_ops__init_task,
	.exit_task		= sched_ext_ops__exit_task,
	.enable			= sched_ext_ops__enable,
	.disable		= sched_ext_ops__disable,
#ifdef CONFIG_EXT_GROUP_SCHED
	.cgroup_init		= sched_ext_ops__cgroup_init,
	.cgroup_exit		= sched_ext_ops__cgroup_exit,
	.cgroup_prep_move	= sched_ext_ops__cgroup_prep_move,
	.cgroup_move		= sched_ext_ops__cgroup_move,
	.cgroup_cancel_move	= sched_ext_ops__cgroup_cancel_move,
	.cgroup_set_weight	= sched_ext_ops__cgroup_set_weight,
#endif
	.cpu_online		= sched_ext_ops__cpu_online,
	.cpu_offline		= sched_ext_ops__cpu_offline,
	.init			= sched_ext_ops__init,
	.exit			= sched_ext_ops__exit,
	.dump			= sched_ext_ops__dump,
	.dump_cpu		= sched_ext_ops__dump_cpu,
	.dump_task		= sched_ext_ops__dump_task,
};

为什么有 kthread 在不断地调整自己的调度策略 (policy)?

#0  __sched_setscheduler (p=p@entry=0xffff888004393280,
    attr=0xffffc9000013beb8, user=user@entry=false, pi=pi@entry=true)
    at kernel/sched/syscalls.c:529
#1  0xffffffff8112d92b in _sched_setscheduler (
    param=0xffffffff8220eb00 <param>, check=false,
    policy=<optimized out>, p=0xffff888004393280)
    at kernel/sched/syscalls.c:788
#2  sched_setscheduler_nocheck (p=p@entry=0xffff888004393280,
    policy=policy@entry=0,
    param=param@entry=0xffffffff8220eb00 <param>)
    at kernel/sched/syscalls.c:835
#3  0xffffffff810dacb1 in kthread (_create=0xffff888004255f80)
    at ./arch/x86/include/asm/current.h:47
#4  0xffffffff810487b1 in ret_from_fork (prev=<optimized out>,
    regs=0xffffc9000013bf58, fn=0xffffffff810dac50 <kthread>,
    fn_arg=0xffff888004255f80) at arch/x86/kernel/process.c:147
#5  0xffffffff810036ea in ret_from_fork_asm ()
    at arch/x86/entry/entry_64.S:244

__sched_setscheduler 中在不断地判断 user 这个变量。

此外,如何理解 __sched_setscheduler 中下面这段对优先级的检查?可以对照 man sched_setscheduler(2) 中的说明:

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 */
	if (attr->sched_priority > MAX_RT_PRIO-1)
		return -EINVAL;
	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
	    (rt_policy(policy) != (attr->sched_priority != 0)))
		return -EINVAL;

Currently, Linux supports the following "normal" (i.e., non-real-time) scheduling policies as values that may be specified in policy:

SCHED_OTHER   the standard round-robin time-sharing policy;

SCHED_BATCH   for "batch" style execution of processes; and

SCHED_IDLE    for running very low priority background jobs.

For each of the above policies, param->sched_priority must be 0.

看看

https://mp.weixin.qq.com/s/d043Be7vfXaq3HFNxUdfLw

https://mp.weixin.qq.com/s/B6zeBR-h-vLgTBe5CwGTSg

https://mp.weixin.qq.com/s/SE1W3IbYP8jeNX5rUDCaGg

https://mp.weixin.qq.com/s/WAPL5h4pigZ3m0-QMi-c0A

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。