balance

`select_idle_*` 的作用是什么

select_idle_sibling

/*
 * Try and locate an idle core/thread in the LLC cache domain.
 */
static int select_idle_sibling(struct task_struct *p, int prev, int target)

调用位置

/*
 * select_task_rq_fair: Select target runqueue for the waking task in domains
 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest CPU in the idlest group, or under
 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 *
 * Returns the target CPU number.
 *
 * preempt must be disabled.
 */
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)

task_numa_compare
- task_numa_find_cpu
  - task_numa_migrate
    - numa_migrate_preferred
      - task_numa_fault : 这个函数调用位置来自于 memory.c huge_memory.c 中的 do_numa_page 之类的函数

select_idle_sibling 中间的部分片段:

	i = select_idle_core(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	i = select_idle_cpu(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	i = select_idle_smt(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

lb 的关键内容 : find_busiest_group 和 find_busiest_queue

被 load_balance 唯一调用

/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 */
static struct rq *find_busiest_queue(struct lb_env *env,
				     struct sched_group *group)

在 group 中间的找到 cfs_rq，因为迁移都是在文件夹中间进行迁移的。

task_group 是描述，

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @env: The load balancing environment.
 *
 * Return:	- The busiest group if imbalance exists.
 */
static struct sched_group *find_busiest_group(struct lb_env *env)

find_busiest_group 的 helper 函数 7000 ~ 8200 。

find_idlest_cpu 和 find_idlest_group : select_task_rq_fair 相关的 @todo 为什么 lb 只需要 busiest 而不需要 idlest 的

load_balance 的实现

kernel/sched/fair.c:6798 的注释

了解一下其中的函数

触发 rebalance 的方法和位置是什么

domain 的概念是什么 ?

idle_balance : 被 schedule() 调用，应该是最容易分析的了
rebalance_domains:
  load_balance : 核心业务 ?

_nohz_idle_balance()
run_rebalance_domains : 被注册到 softirq 中间了
  rebalance_domains

nohz_idle_balance(): 被 run_rebalance_domains 唯一调用
nohz_newidle_balance(): 被 idle_balance 唯一调用
  _nohz_idle_balance():


scheduler_tick()
  trigger_load_balance
    nohz_balancer_kick() : 值得关注一下，好像和之前的所有的东西都不是一个东西呀!

好像就是 softirq 触发的，然后进行整个流程走一下以及从 timer 中间触发!

首先，理解了

https://docs.kernel.org/admin-guide/pm/intel-speed-select.html

打印出来当前正在 CPU 上运行的:

ps -e -o pid,ppid,sgi_p,state,args | awk '$3!="*" {print}'

Scheduler Domain 从 acpi 中获取吗?
(CONFIG_ENERGY_MODEL) && (CONFIG_CPU_FREQ_GOV_SCHEDUTIL) : 做什么的 ?
get_group 和 build_balance_mask 上有非常多暂时看不懂的注释

topology

build_balance_mask
- span
- sched_domain_topology_level
sched_group
root_domain
- init_rootdomain
cpu_attach_domain
degenerate
rq_attach_root

如何初始化

什么时候切换

大核小核的影响

如何实现 taskset 的效果

关键结构体

root_domain

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member CPUs from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t		refcount;
	atomic_t		rto_count;
	struct rcu_head		rcu;
	cpumask_var_t		span;
	cpumask_var_t		online;

	/*
	 * Indicate pullable load on at least one CPU, e.g:
	 * - More than one runnable task
	 * - Running task is misfit
	 */
	int			overload;

	/* Indicate one or more cpus over-utilized (tipping point) */
	int			overutilized;

	/*
	 * The bit corresponding to a CPU gets set here if such CPU has more
	 * than one runnable -deadline task (as it is below for RT tasks).
	 */
	cpumask_var_t		dlo_mask;
	atomic_t		dlo_count;
	struct dl_bw		dl_bw;
	struct cpudl		cpudl;

	/*
	 * Indicate whether a root_domain's dl_bw has been checked or
	 * updated. It's monotonously increasing value.
	 *
	 * Also, some corner cases, like 'wrap around' is dangerous, but given
	 * that u64 is 'big enough'. So that shouldn't be a concern.
	 */
	u64 visit_gen;

#ifdef HAVE_RT_PUSH_IPI
	/*
	 * For IPI pull requests, loop across the rto_mask.
	 */
	struct irq_work		rto_push_work;
	raw_spinlock_t		rto_lock;
	/* These are only updated and read within rto_lock */
	int			rto_loop;
	int			rto_cpu;
	/* These atomics are updated outside of a lock */
	atomic_t		rto_loop_next;
	atomic_t		rto_loop_start;
#endif
	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t		rto_mask;
	struct cpupri		cpupri;

	unsigned long		max_cpu_capacity;

	/*
	 * NULL-terminated list of performance domains intersecting with the
	 * CPUs of the rd. Protected by RCU.
	 */
	struct perf_domain __rcu *pd;
};

sched_group

struct sched_group {
	struct sched_group	*next;			/* Must be a circular list */
	atomic_t		ref;

	unsigned int		group_weight;
	struct sched_group_capacity *sgc;
	int			asym_prefer_cpu;	/* CPU of highest priority in group */
	int			flags;

	/*
	 * The CPUs this group covers.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long		cpumask[];
};

sched_domain

struct sched_domain {
	/* These fields must be setup */
	struct sched_domain __rcu *parent;	/* top domain must be null terminated */
	struct sched_domain __rcu *child;	/* bottom domain must be null terminated */
	struct sched_group *groups;	/* the balancing groups of the domain */
	unsigned long min_interval;	/* Minimum balance interval ms */
	unsigned long max_interval;	/* Maximum balance interval ms */
	unsigned int busy_factor;	/* less balancing by factor if busy */
	unsigned int imbalance_pct;	/* No balance until over watermark */
	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
	unsigned int imb_numa_nr;	/* Nr running tasks that allows a NUMA imbalance */

	int nohz_idle;			/* NOHZ IDLE status */
	int flags;			/* See SD_* */
	int level;

	/* Runtime fields. */
	unsigned long last_balance;	/* init to jiffies. units in jiffies */
	unsigned int balance_interval;	/* initialise to 1. units in ms. */
	unsigned int nr_balance_failed; /* initialise to 0 */

	/* idle_balance() stats */
	u64 max_newidle_lb_cost;
	unsigned long last_decay_max_lb_cost;

	u64 avg_scan_cost;		/* select_idle_sibling */

#ifdef CONFIG_SCHEDSTATS
	/* load_balance() stats */
	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];

	/* Active load balancing */
	unsigned int alb_count;
	unsigned int alb_failed;
	unsigned int alb_pushed;

	/* SD_BALANCE_EXEC stats */
	unsigned int sbe_count;
	unsigned int sbe_balanced;
	unsigned int sbe_pushed;

	/* SD_BALANCE_FORK stats */
	unsigned int sbf_count;
	unsigned int sbf_balanced;
	unsigned int sbf_pushed;

	/* try_to_wake_up() stats */
	unsigned int ttwu_wake_remote;
	unsigned int ttwu_move_affine;
	unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
	char *name;
#endif
	union {
		void *private;		/* used during construction */
		struct rcu_head rcu;	/* used during destruction */
	};
	struct sched_domain_shared *shared;

	unsigned int span_weight;
	/*
	 * Span of all CPUs in this domain.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long span[];
};

cpumask

start_kernel
- setup_arch
  - smp_init_cpus :
    - of_parse_and_init_cpus
    - acpi_parse_and_init_cpus
    - smp_cpu_setup
      - set_cpu_possible
        
        cpumask_set_cpu(cpu, &__cpu_possible_mask);
- arch_call_rest_init
  - rest_init
    - kernel_init
      - kernel_init_freeable
        
        smp_prepare_cpus
        
        set_cpu_present : 如果这只是将 possible 拷贝到 present，其意义何在 ?
        
        cpumask_set_cpu(cpu, &__cpu_present_mask);
        
        smp_init
        
        bringup_nonboot_cpus
        
        cpu_up : 实际上，跟丢了

SMT : 共享 L1 高速缓存; MC : 共享 LLC; SOC : DIE

config SCHED_SMT
    bool "SMT (Hyperthreading) scheduler support"
    depends on SPARC64 && SMP
    default y
    help
      SMT scheduler support improves the CPU scheduler's decision making
      when dealing with SPARC cpus at a cost of slightly increased overhead
      in some places. If unsure say N here.

config SCHED_MC
    bool "Multi-core scheduler support"
    depends on SPARC64 && SMP
    default y
    help
      Multi-core scheduler support improves the CPU scheduler's decision
      making when dealing with multi-core CPU chips at a cost of slightly
      increased overhead in some places. If unsure say N here.

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
    { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
    { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
    { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    { NULL, },
};

typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);

struct sched_domain_topology_level {
    sched_domain_mask_f mask;      // 返回某个 cpu 在该 topology level 下的 CPU 的兄弟 cpu 的 mask
    sched_domain_flags_f sd_flags; // 用于返回 domain 的属性
    int         flags;
    int         numa_level;
    struct sd_data      data;
};

struct sd_data {
    struct sched_domain *__percpu *sd; // 优秀啊，每一个 cpu 都保存一份所有人的 sched_domain
    struct sched_domain_shared *__percpu *sds;
    struct sched_group *__percpu *sg;
    struct sched_group_capacity *__percpu *sgc;
};

sched_domain 被划分为若干个 sched_group, sched_group 是调度(负载均衡)的最小单位。
- 感觉这么定义的话，岂不是下一级的 sched_domain 就是上一级的 sched_group
sched_domain_span 表示 cpu 当前的 domain 管辖的 cpu 范围
sched_init_domains
- build_sched_domains
- __visit_domain_allocation_hell
  - __sdt_alloc
  - alloc_rootdomain
- build_sched_domain
  - sd_init
- build_sched_groups
构建 domain 的结果是，在每一个 topology level 中间都存在 NR_cpu 个 sched_domain
- 这个 sched_domain 包含一定数量的 cpu
- sched_domain 指向一个链表的 sched_group

TODO

cpu_attach_domain
sched_group_span

load_balance_mask

  struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

smp balance

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
    nohz.next_balance = jiffies;
    nohz.next_blocked = jiffies;
    zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */

}

enum migration_type {
    migrate_load = 0,
    migrate_util,
    migrate_task,
    migrate_misfit
};

/*
 * 'group_type' describes the group of CPUs at the moment of load balancing.
 *
 * The enum is ordered by pulling priority, with the group with lowest priority
 * first so the group_type can simply be compared when selecting the busiest
 * group. See update_sd_pick_busiest().
 */
enum group_type {
    /* The group has spare capacity that can be used to run more tasks.  */
    group_has_spare = 0,
    /*
     * The group is fully used and the tasks don't compete for more CPU
     * cycles. Nevertheless, some tasks might wait before running.
     */
    group_fully_busy,
    /*
     * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
     * and must be migrated to a more powerful CPU.
     */
    group_misfit_task,
    /*
     * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
     * and the task should be migrated to it instead of running on the
     * current CPU.
     */
    group_asym_packing,
    /*
     * The tasks' affinity constraints previously prevented the scheduler
     * from balancing the load across the system.
     */
    group_imbalanced,
    /*
     * The CPU is overloaded and can't provide expected CPU cycles to all
     * tasks.
     */
    group_overloaded
};

migration 和 group_type 的关系是什么 ?
run_rebalance_domains
- - trigger_load_balance
- - nohz_csd_func
- nohz_idle_balance
- update_blocked_averages
- rebalance_domains
  - should_we_balance
  - find_busiest_group
    - update_sd_lb_stats
      - update_group_capacity
      - update_sg_lb_stats
      - update_sd_pick_busiest
    - calculate_imbalance : 计算需要迁移多少负载量才能达到均衡
  - find_busiest_queue
  - stop_one_cpu_nowait
  - detach_tasks : 注意，我们只迁移当前没有在运行的 task
    - detach_task
      - deactivate_task
      - set_task_cpu
there are two cpumask
- 应该是一个描述这个 sched_group 包含哪些 cpu，一个描述哪些 cpu 负责做 load balance
- struct sched_group_capacity::cpumask
- struct sched_group::cpumask
select_task_rq
- - sched_exec
- - select_task_rq
  - - wake_up_new_task
  - - try_to_wake_up
- select_task_rq_fair
  - wake_affine
    - wake_affine_idle
    - wake_affine_weight
  - select_idle_sibling : 快速路径，首先在附近进行查找
    - available_idle_cpu
    - sched_idle_cpu
    - select_idle_core
    - select_idle_cpu
    - select_idle_smt
  - find_idlest_cpu : 慢速路径，实在不行就在全局查找
    - find_idlest_group
    - find_idlest_group_cpu
SD_WAKE_AFFINE 标志位 : 表示运行唤醒进程的 CPU 可以运行这个被唤醒的进程。

先搞清楚一下 CPU 的物理结构吧

https://en.wikichip.org/wiki/amd/microarchitectures/zen
https://www.youtube.com/watch?v=qq13-xxAVDE : 这种类似的科普应该不少吧

cluster

https://www.hikunpeng.com/doc_center/source/zh/kunpengcpfs/systuningguide/systemtg/kunpengcluster_05_0007.html

这两个应该关注下:

cat /sys/devices/system/cpu/cpu0/topology/cluster_cpus_list
echo 1 > /proc/sys/kernel/sched_cluster

看一个有趣的例子

8 thread 的机器上 make -j4 ，top 显示并不是 4 个 core 跑满，而是每一个 50% 。

top - 23:27:27 up 18:20,  2 users,  load average: 4.66, 2.62, 2.77
Tasks: 312 total,   5 running, 307 sleeping,   0 stopped,   0 zombie
%Cpu0  : 26.3 us,  6.2 sy,  0.0 ni, 66.8 id,  0.0 wa,  0.3 hi,  0.3 si,  0.0 st
%Cpu1  : 20.6 us,  5.0 sy,  0.0 ni, 74.4 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu2  : 28.1 us,  5.3 sy,  0.0 ni, 66.6 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu3  : 26.2 us,  5.0 sy,  0.0 ni, 68.5 id,  0.0 wa,  0.0 hi,  0.3 si,  0.0 st
%Cpu4  : 63.3 us,  2.0 sy,  0.0 ni, 34.7 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu5  : 70.7 us,  2.0 sy,  0.0 ni, 27.0 id,  0.0 wa,  0.3 hi,  0.0 si,  0.0 st
%Cpu6  : 68.5 us,  2.6 sy,  0.0 ni, 28.5 id,  0.0 wa,  0.3 hi,  0.0 si,  0.0 st
%Cpu7  : 67.7 us,  2.3 sy,  0.0 ni, 29.7 id,  0.0 wa,  0.3 hi,  0.0 si,  0.0 st
MiB Mem :  15635.1 total,    655.2 free,   2410.4 used,  13839.3 buff/cache
MiB Swap:  16384.0 total,  16261.0 free,    123.0 used.  13224.7 avail Mem

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
 145978 martins3  20   0  210176  93696  64512 R   4.7   0.6   0:00.14 clang
 143968 martins3  20   0  237904   6144   3072 S   2.0   0.0   0:00.46 make
 145980 martins3  20   0  198448  81920  63488 R   2.0   0.5   0:00.06 clang
 145984 martins3  20   0  199696  73728  54784 R   1.7   0.5   0:00.05 clang
 145985 martins3  20   0  190160  73216  61952 R   1.0   0.5   0:00.03 clang
 145965 martins3  20   0   13968  10400   9728 S   0.7   0.1   0:00.02 clang
 145971 martins3  20   0   13968  10400   9728 S   0.7   0.1   0:00.02 clang
 145976 martins3  20   0   14112  10432   9728 S   0.7   0.1   0:00.02 clang

继续测试，如果在一个 32 thread 的机器上用 make -j4 来编译内核，同时用 htop 来观察，可以发现负载其实是较为均匀地分布到各个 logical core 上的。那么，不是说为了 cache 亲和性，调度器会尽量把负载放到一起吗？

本站所有文章转发 CSDN 将按侵权追究法律责任，其它情况随意。