Skip to the content.

balance

select_idle_* 的作用是什么

/*
 * Try and locate an idle core/thread in the LLC cache domain.
 */
static int select_idle_sibling(struct task_struct *p, int prev, int target)

调用位置

/*
 * select_task_rq_fair: Select target runqueue for the waking task in domains
 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest CPU in the idlest group, or under
 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 *
 * Returns the target CPU number.
 *
 * preempt must be disabled.
 */
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
  1. task_numa_compare
    • task_numa_find_cpu
      • task_numa_migrate
          • numa_migrate_preferred
          • task_numa_fault : 这个函数调用位置来自于 memory.c huge_memory.c 中的 do_numa_page 之类的函数

select_idle_sibling 中间的部分片段:

	i = select_idle_core(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	i = select_idle_cpu(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	i = select_idle_smt(p, sd, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

lb 的关键内容 : find_busiest_group 和 find_busiest_queue

被 load_balance 唯一调用

/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 */
static struct rq *find_busiest_queue(struct lb_env *env,
				     struct sched_group *group)

在 group 中间的找到 cfs_rq,因为迁移都是在文件夹中间进行迁移的。

task_group 是描述,

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @env: The load balancing environment.
 *
 * Return:	- The busiest group if imbalance exists.
 */
static struct sched_group *find_busiest_group(struct lb_env *env)

find_busiest_group 的 helper 函数 7000 ~ 8200 。

find_idlest_cpu 和 find_idlest_group:与 select_task_rq_fair 相关。@todo 为什么 lb 只需要 busiest 而不需要 idlest?

load_balance 的实现

kernel/sched/fair.c:6798 的注释

了解一下其中的函数

触发 rebalance 的方法和位置是什么

  1. domain 的概念是什么 ?
idle_balance : schedule() 调用,应该是最容易分析了
rebalance_domains:
  load_balance : 核心业务 ?

_nohz_idle_balance()
run_rebalance_domains : 被注册到 softirq 中间了
  rebalance_domains

nohz_idle_balance(): 被 run_rebalance_domains 唯一调用
nohz_newidle_balance(): 被 idle_balance 唯一调用
  _nohz_idle_balance():


scheduler_tick()
  trigger_load_balance
    nohz_balancer_kick() : 值得关注一下,好像和之前的所有的东西都不是一个东西呀!

好像就是由 softirq 触发,然后把整个流程走一遍;另外也会从 timer 中触发!

首先,理解了

topology

如何初始化

什么时候切换

大核小核的影响

如何实现 taskset 的效果

关键结构体

root_domain

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member CPUs from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	/* NOTE(review): presumably the reference count of this object -- confirm */
	atomic_t		refcount;
	/* NOTE(review): presumably counts CPUs flagged in rto_mask below -- confirm */
	atomic_t		rto_count;
	/* NOTE(review): presumably used for RCU-deferred freeing -- confirm */
	struct rcu_head		rcu;
	/*
	 * NOTE(review): presumably the CPUs belonging to this root domain and
	 * the subset of them that is currently online -- confirm.
	 */
	cpumask_var_t		span;
	cpumask_var_t		online;

	/*
	 * Indicate pullable load on at least one CPU, e.g:
	 * - More than one runnable task
	 * - Running task is misfit
	 */
	int			overload;

	/* Indicate one or more cpus over-utilized (tipping point) */
	int			overutilized;

	/*
	 * The bit corresponding to a CPU gets set here if such CPU has more
	 * than one runnable -deadline task (as it is below for RT tasks).
	 */
	cpumask_var_t		dlo_mask;
	atomic_t		dlo_count;
	struct dl_bw		dl_bw;
	struct cpudl		cpudl;

	/*
	 * Indicate whether a root_domain's dl_bw has been checked or
	 * updated. It's monotonously increasing value.
	 *
	 * Also, some corner cases, like 'wrap around' is dangerous, but given
	 * that u64 is 'big enough'. So that shouldn't be a concern.
	 */
	u64 visit_gen;

	/* The RT push-IPI state below exists only when HAVE_RT_PUSH_IPI is defined. */
#ifdef HAVE_RT_PUSH_IPI
	/*
	 * For IPI pull requests, loop across the rto_mask.
	 */
	struct irq_work		rto_push_work;
	raw_spinlock_t		rto_lock;
	/* These are only updated and read within rto_lock */
	int			rto_loop;
	int			rto_cpu;
	/* These atomics are updated outside of a lock */
	atomic_t		rto_loop_next;
	atomic_t		rto_loop_start;
#endif
	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t		rto_mask;
	struct cpupri		cpupri;

	/* NOTE(review): presumably the largest CPU capacity within this root domain -- confirm */
	unsigned long		max_cpu_capacity;

	/*
	 * NULL-terminated list of performance domains intersecting with the
	 * CPUs of the rd. Protected by RCU.
	 */
	struct perf_domain __rcu *pd;
};

sched_group

/*
 * A group of CPUs. Groups are chained through ->next into a circular list,
 * and the set of CPUs a group covers is stored in the trailing cpumask[]
 * flexible array member.
 */
struct sched_group {
	struct sched_group	*next;			/* Must be a circular list */
	/* NOTE(review): presumably a reference count for this group -- confirm */
	atomic_t		ref;

	/* NOTE(review): presumably the number of CPUs set in cpumask[] -- confirm */
	unsigned int		group_weight;
	/* Capacity data kept in a separate sched_group_capacity object */
	struct sched_group_capacity *sgc;
	int			asym_prefer_cpu;	/* CPU of highest priority in group */
	int			flags;

	/*
	 * The CPUs this group covers.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long		cpumask[];
};

sched_domain

/*
 * One level of the scheduling-domain hierarchy. Domains are linked to their
 * parent and child levels, own a list of balancing groups, carry the tunables
 * and runtime state used by load balancing, and end in a variable-length
 * span[] holding all CPUs of the domain.
 */
struct sched_domain {
	/* These fields must be setup */
	struct sched_domain __rcu *parent;	/* top domain must be null terminated */
	struct sched_domain __rcu *child;	/* bottom domain must be null terminated */
	struct sched_group *groups;	/* the balancing groups of the domain */
	unsigned long min_interval;	/* Minimum balance interval ms */
	unsigned long max_interval;	/* Maximum balance interval ms */
	unsigned int busy_factor;	/* less balancing by factor if busy */
	unsigned int imbalance_pct;	/* No balance until over watermark */
	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
	unsigned int imb_numa_nr;	/* Nr running tasks that allows a NUMA imbalance */

	int nohz_idle;			/* NOHZ IDLE status */
	int flags;			/* See SD_* */
	/* NOTE(review): presumably the position of this domain in the hierarchy -- confirm */
	int level;

	/* Runtime fields. */
	unsigned long last_balance;	/* init to jiffies. units in jiffies */
	unsigned int balance_interval;	/* initialise to 1. units in ms. */
	unsigned int nr_balance_failed; /* initialise to 0 */

	/* idle_balance() stats */
	u64 max_newidle_lb_cost;
	unsigned long last_decay_max_lb_cost;

	u64 avg_scan_cost;		/* select_idle_sibling */

	/* Statistics counters below are compiled in only with CONFIG_SCHEDSTATS. */
#ifdef CONFIG_SCHEDSTATS
	/* load_balance() stats */
	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];

	/* Active load balancing */
	unsigned int alb_count;
	unsigned int alb_failed;
	unsigned int alb_pushed;

	/* SD_BALANCE_EXEC stats */
	unsigned int sbe_count;
	unsigned int sbe_balanced;
	unsigned int sbe_pushed;

	/* SD_BALANCE_FORK stats */
	unsigned int sbf_count;
	unsigned int sbf_balanced;
	unsigned int sbf_pushed;

	/* try_to_wake_up() stats */
	unsigned int ttwu_wake_remote;
	unsigned int ttwu_move_affine;
	unsigned int ttwu_move_balance;
#endif
	/* The domain name is available only with CONFIG_SCHED_DEBUG. */
#ifdef CONFIG_SCHED_DEBUG
	char *name;
#endif
	/* The two members share storage: one is used while building, the other while tearing down. */
	union {
		void *private;		/* used during construction */
		struct rcu_head rcu;	/* used during destruction */
	};
	/* NOTE(review): presumably state shared between the CPUs of this domain -- confirm */
	struct sched_domain_shared *shared;

	/* NOTE(review): presumably the number of CPUs set in span[] -- confirm */
	unsigned int span_weight;
	/*
	 * Span of all CPUs in this domain.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long span[];
};

cpumask

SMT:共享 L1 高速缓存;MC:共享 LLC;SOC:DIE

config SCHED_SMT
    bool "SMT (Hyperthreading) scheduler support"
    depends on SPARC64 && SMP
    default y
    help
      SMT scheduler support improves the CPU scheduler's decision making
      when dealing with SPARC cpus at a cost of slightly increased overhead
      in some places. If unsure say N here.

config SCHED_MC
    bool "Multi-core scheduler support"
    depends on SPARC64 && SMP
    default y
    help
      Multi-core scheduler support improves the CPU scheduler's decision
      making when dealing with multi-core CPU chips at a cost of slightly
      increased overhead in some places. If unsure say N here.
/*
 * Topology list, bottom-up.
 *
 * The SMT and MC levels are compiled in only when CONFIG_SCHED_SMT and
 * CONFIG_SCHED_MC are enabled; the DIE level is always present. The last
 * entry has a NULL mask callback (presumably the end-of-table sentinel that
 * iterators stop on -- confirm against the users of this table).
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
    /* Lowest level: SMT siblings */
    { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
    /* Next level up: core group (MC) */
    { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
    /* DIE level: no flags callback is supplied for this entry */
    { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    { NULL, },
};

/* Callback that maps a CPU number to a cpumask. */
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
/* Callback that takes no arguments and returns an int flag word. */
typedef int (*sched_domain_flags_f)(void);

/*
 * Description of one topology level: a mask callback, a flags callback,
 * plain flag/NUMA-level integers and the data attached to the level.
 */
struct sched_domain_topology_level {
    sched_domain_mask_f mask;      // returns the mask of the sibling CPUs of a given cpu at this topology level
    sched_domain_flags_f sd_flags; // returns the attributes (flags) of the domain
    int         flags;
    int         numa_level;
    struct sd_data      data;
};

/*
 * Per-CPU pointer tables for the objects of one topology level:
 * sched_domain, sched_domain_shared, sched_group and sched_group_capacity.
 */
struct sd_data {
    struct sched_domain *__percpu *sd; // author's note: nice, every CPU keeps its own sched_domain entry -- NOTE(review): confirm exact per-CPU semantics
    struct sched_domain_shared *__percpu *sds;
    struct sched_group *__percpu *sg;
    struct sched_group_capacity *__percpu *sgc;
};

TODO

smp balance

/*
 * Boot-time setup for the fair class's SMP load balancing.
 *
 * With CONFIG_SMP, installs run_rebalance_domains() as the handler for
 * SCHED_SOFTIRQ. With CONFIG_NO_HZ_COMMON as well, it also initialises the
 * nohz balancing state. Without CONFIG_SMP the function body is empty.
 */
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
    /* Periodic rebalancing runs from softirq context via this handler. */
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
    /* Seed both nohz deadlines with the current jiffies value. */
    nohz.next_balance = jiffies;
    nohz.next_blocked = jiffies;
    /* Zeroed idle-CPU mask; the allocation result is not checked here. */
    zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */

}
/* Kinds of migration, numbered consecutively starting at 0. */
enum migration_type {
    migrate_load   = 0,
    migrate_util   = 1,
    migrate_task   = 2,
    migrate_misfit = 3,
};

/*
 * Classification of a group of CPUs at load-balancing time.
 *
 * Enumerators are listed in ascending pulling priority, so selecting the
 * busiest group reduces to a plain comparison of group_type values; see
 * update_sd_pick_busiest().
 */
enum group_type {
    /* Spare capacity is left in the group; more tasks could run here. */
    group_has_spare = 0,
    /*
     * Fully used, yet the tasks are not competing for extra CPU cycles;
     * some tasks might still wait before they run.
     */
    group_fully_busy = 1,
    /*
     * SD_ASYM_CPUCAPACITY only: a task does not fit the CPU's capacity and
     * has to be moved to a more powerful CPU.
     */
    group_misfit_task = 2,
    /*
     * SD_ASYM_PACKING only: a local CPU with higher capacity is available
     * and the task should run there rather than on the current CPU.
     */
    group_asym_packing = 3,
    /*
     * Task affinity constraints earlier kept the scheduler from balancing
     * the load across the system.
     */
    group_imbalanced = 4,
    /* Overloaded: the CPU cannot give every task its expected CPU cycles. */
    group_overloaded = 5,
};

先搞清楚一下 CPU 的物理结构吧

cluster

https://www.hikunpeng.com/doc_center/source/zh/kunpengcpfs/systuningguide/systemtg/kunpengcluster_05_0007.html

这两个应该关注下:

cat /sys/devices/system/cpu/cpu0/topology/cluster_cpus_list
echo 1 > /proc/sys/kernel/sched_cluster

看一个有趣的例子

8 thread 的机器上 make -j4 ,top 显示并不是 4 个 core 跑满,而是每一个 50% 。

top - 23:27:27 up 18:20,  2 users,  load average: 4.66, 2.62, 2.77
Tasks: 312 total,   5 running, 307 sleeping,   0 stopped,   0 zombie
%Cpu0  : 26.3 us,  6.2 sy,  0.0 ni, 66.8 id,  0.0 wa,  0.3 hi,  0.3 si,  0.0 st
%Cpu1  : 20.6 us,  5.0 sy,  0.0 ni, 74.4 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu2  : 28.1 us,  5.3 sy,  0.0 ni, 66.6 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu3  : 26.2 us,  5.0 sy,  0.0 ni, 68.5 id,  0.0 wa,  0.0 hi,  0.3 si,  0.0 st
%Cpu4  : 63.3 us,  2.0 sy,  0.0 ni, 34.7 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu5  : 70.7 us,  2.0 sy,  0.0 ni, 27.0 id,  0.0 wa,  0.3 hi,  0.0 si,  0.0 st
%Cpu6  : 68.5 us,  2.6 sy,  0.0 ni, 28.5 id,  0.0 wa,  0.3 hi,  0.0 si,  0.0 st
%Cpu7  : 67.7 us,  2.3 sy,  0.0 ni, 29.7 id,  0.0 wa,  0.3 hi,  0.0 si,  0.0 st
MiB Mem :  15635.1 total,    655.2 free,   2410.4 used,  13839.3 buff/cache
MiB Swap:  16384.0 total,  16261.0 free,    123.0 used.  13224.7 avail Mem

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
 145978 martins3  20   0  210176  93696  64512 R   4.7   0.6   0:00.14 clang
 143968 martins3  20   0  237904   6144   3072 S   2.0   0.0   0:00.46 make
 145980 martins3  20   0  198448  81920  63488 R   2.0   0.5   0:00.06 clang
 145984 martins3  20   0  199696  73728  54784 R   1.7   0.5   0:00.05 clang
 145985 martins3  20   0  190160  73216  61952 R   1.0   0.5   0:00.03 clang
 145965 martins3  20   0   13968  10400   9728 S   0.7   0.1   0:00.02 clang
 145971 martins3  20   0   13968  10400   9728 S   0.7   0.1   0:00.02 clang
 145976 martins3  20   0   14112  10432   9728 S   0.7   0.1   0:00.02 clang

继续测试:如果在一台 32 thread 的机器上用 make -j4 编译内核,同时用 htop 观察,可以发现负载其实是较为均匀地分布到各个 logical core 上的。那么,不是说为了 cache 会尽量把负载放到一起吗?

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。