Skip to the content.

process 的状态 procstat

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state are
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */

/* Used in tsk->state: */
#define TASK_RUNNING			0x00000000
#define TASK_INTERRUPTIBLE		0x00000001
#define TASK_UNINTERRUPTIBLE		0x00000002
#define __TASK_STOPPED			0x00000004
#define __TASK_TRACED			0x00000008
/* Used in tsk->exit_state: */
#define EXIT_DEAD			0x00000010
#define EXIT_ZOMBIE			0x00000020
#define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
#define TASK_PARKED			0x00000040
#define TASK_DEAD			0x00000080
#define TASK_WAKEKILL			0x00000100
#define TASK_WAKING			0x00000200
#define TASK_NOLOAD			0x00000400
#define TASK_NEW			0x00000800
#define TASK_RTLOCK_WAIT		0x00001000
#define TASK_FREEZABLE			0x00002000
#define __TASK_FREEZABLE_UNSAFE	       (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP))
#define TASK_FROZEN			0x00008000
#define TASK_STATE_MAX			0x00010000

#define TASK_ANY			(TASK_STATE_MAX-1)

/*
 * DO NOT ADD ANY NEW USERS !
 */
#define TASK_FREEZABLE_UNSAFE		(TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE			(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED			(TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED			__TASK_TRACED

#define TASK_IDLE			(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL			(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* get_task_state(): */
#define TASK_REPORT			(TASK_RUNNING | TASK_INTERRUPTIBLE | \
					 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
					 TASK_PARKED)

TASK_UNINTERRUPTIBLE TASK_INTERRUPTIBLE

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */
static void __inode_wait_for_writeback(struct inode *inode)
	__releases(inode->i_lock)
	__acquires(inode->i_lock)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		__wait_on_bit(wqh, &wq, bit_wait,
			      TASK_UNINTERRUPTIBLE);
		spin_lock(&inode->i_lock);
	}
}

这就是等待 io 总是在 uninterrupted 的证据。

TASK_PARKED

主要是给 kthread 使用,处于该状态的时候,表示 __kthread_parkme,其中一直睡眠,然后等待其他人唤醒。

TASK_FROZEN TASK_FREEZABLE

似乎是 freeze 采用的:

TASK_NEW

sched_fork 中赋值

#0  sched_fork (clone_flags=clone_flags@entry=18874368, p=p@entry=0xffff888108f7b480) at kernel/sched/core.c:4690
#1  0xffffffff8113e01c in copy_process (pid=pid@entry=0x0 <fixed_percpu_data>, trace=trace@entry=0, node=node@entry=-1, args=args@entry=0xffffc90002297eb0) at kernel/fork.c:2477
#2  0xffffffff8113f4df in kernel_clone (args=args@entry=0xffffc90002297eb0) at kernel/fork.c:2918
#3  0xffffffff8113f966 in __do_sys_clone (clone_flags=<optimized out>, newsp=<optimized out>, parent_tidptr=<optimized out>, child_tidptr=<optimized out>, tls=<optimized out>) at kernel/fork.c:3061
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_change_group_fair(struct task_struct *p)
{
	/*
	 * We couldn't detach or attach a forked task which
	 * hasn't been woken up by wake_up_new_task().
	 */
	if (READ_ONCE(p->__state) == TASK_NEW)
		return;

	detach_task_cfs_rq(p);

#ifdef CONFIG_SMP
	/* Tell se's cfs_rq has been changed -- migrated */
	p->se.avg.last_update_time = 0;
#endif
	set_task_rq(p, task_cpu(p));
	attach_task_cfs_rq(p);
}
#0  wake_up_new_task (p=p@entry=0xffff888108f7b480) at kernel/sched/core.c:4810
#1  0xffffffff8113f53b in kernel_clone (args=args@entry=0xffffc90002297eb0) at kernel/fork.c:2949
#2  0xffffffff8113f966 in __do_sys_clone (clone_flags=<optimized out>, newsp=<optimized out>, parent_tidptr=<optimized out>, child_tidptr=<optimized out>, tls=<optimized out>) at kernel/fork.c:3061

好吧,需要更多的背景知识。

TASK_NOLOAD

/*
 * Like schedule_timeout_uninterruptible(), except this task will not contribute
 * to load average.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
	__set_current_state(TASK_IDLE);
	return schedule_timeout(timeout);
}

等待 io 的时候也是计入到 LOAD 中的,所以

TASK_WAKING

	/*
	 * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
	 * == 0), which means we need to do an enqueue, change p->state to
	 * TASK_WAKING such that we can unlock p->pi_lock before doing the
	 * enqueue, such as ttwu_queue_wakelist().
	 */
	WRITE_ONCE(p->__state, TASK_WAKING);

TASK_WAKEKILL && TASK_KILLABLE

Like most versions of Unix, Linux has two fundamental ways in which a process can be put to sleep. A process which is placed in the TASK_INTERRUPTIBLE state will sleep until either (1) something explicitly wakes it up, or (2) a non-masked signal is received. The TASK_UNINTERRUPTIBLE state, instead, ignores signals; processes in that state will require an explicit wakeup before they can run again.

Kernel code which uses interruptible sleeps must always check to see whether it woke up as a result of a signal, and, if so, clean up whatever it was doing and return -EINTR back to user space. The user-space side, too, must realize that a system call was interrupted and respond accordingly; not all user-space programmers are known for their diligence in this regard. Making a sleep uninterruptible eliminates these problems, but at the cost of being, well, uninterruptible. If the expected wakeup event does not materialize, the process will wait forever and there is usually nothing that anybody can do about it short of rebooting the system. This is the source of the dreaded, unkillable process which is shown to be in the “D” state by ps.

这个总结的太好了。

所以这个组合非常合理,拒绝信号,但是除了 kill 信号

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE			(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)

__TASK_STOPPED

程序接受到了 stop 信号。

top 的各种妙用

  1. 观察各个 thread 的 CPU 利用率 top -H -p $pid
    • 不要用 pidstat ,那个不准的

理解 ps, top 以及 htop 关于 process state 的输出

首先,他们的编码都是相同的,具体的含义为:

ps 的:

PROCESS STATE CODES
       Here are the different values that the s, stat and state output specifiers (header "STAT" or "S") will display to describe the
       state of a process:

               D    uninterruptible sleep (usually IO)
               I    Idle kernel thread
               R    running or runnable (on run queue)
               S    interruptible sleep (waiting for an event to complete)
               T    stopped by job control signal
               t    stopped by debugger during the tracing
               Z    defunct ("zombie") process, terminated but not reaped by its parent

其中关于 T 和 t 的差别:

sleep 被 ctrl z 之后

0 T martins3   89366   89218  0  80   0 -  1380 do_sig 08:30 pts/3    00:00:00 sleep 1000

如果被 gdb 调试

0 t martins3   91061   91051  0  80   0 -   628 ptrace 08:32 pts/3    00:00:00 /tmp/a.out

也就是这两个的区别:

#define JOBCTL_STOPPED_BIT	26	/* do_signal_stop() */
#define JOBCTL_TRACED_BIT	27	/* ptrace_stop() */

问题

#define TASK_IDLE			(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

为什么 TASK_UNINTERRUPTIBLE 和 TASK_NOLOAD 组合起来叫做 TASK_IDLE 或者说,为什么 TASK_IDLE 非要 TASK_UNINTERRUPTIBLE

[ ] 阅读一下这个: The Unix process API is unreliable and unsafe

https://news.ycombinator.com/item?id=35264487

2.1 Process State

As its name implies, the state field of the process descriptor describes what is currently happening to the process. It consists of an array of flags, each of which describes a possible process state. In the current Linux version, these states are mutually exclusive, and hence exactly one flag of state always is set; the remaining flags are cleared. The following are the possible process states:

The process is either executing on a CPU or waiting to be executed.

The process is suspended (sleeping) until some condition becomes true. Raising a hardware interrupt, releasing a system resource the process is waiting for, or delivering a signal are examples of conditions that might wake up the process (put its state back to TASK_RUNNING).

Like TASK_INTERRUPTIBLE, except that delivering a signal to the sleeping process leaves its state unchanged. This process state is seldom used. It is valuable, however, under certain specific conditions in which a process must wait until a given event occurs without being interrupted. For instance, this state may be used when a process opens a device file and the corresponding device driver starts probing for a corresponding hardware device. The device driver must not be interrupted until the probing is complete, or the hardware device could be left in an unpredictable state.

Process execution has been stopped; the process enters this state after receiving a SIGSTOP, SIGTSTP, SIGTTIN, or SIGTTOU signal.

Process execution has been stopped by a debugger. When a process is being monitored by another (such as when a debugger executes a ptrace() system call to monitor a test program), each signal may put the process in the TASK_TRACED state.

Two additional states of the process can be stored both in the state field and in the exit_state field of the process descriptor; as the field name suggests, a process reaches one of these two states only when its execution is terminated:

Process execution is terminated, but the parent process has not yet issued a wait4() or waitpid() system call to return information about the dead process. Before the wait()-like call is issued, the kernel cannot discard the data contained in the dead process descriptor because the parent might need it. (See the section “Process Removal” near the end of this chapter.)

The final state: the process is being removed by the system because the parent process has just issued a wait4() or waitpid() system call for it. Changing its state from EXIT_ZOMBIE to EXIT_DEAD avoids race conditions due to other threads of execution that execute wait()-like calls on the same process (see Chapter 5).

@todo 似乎的确是这个 flag, 但是实际上存在更多的 flags

/* Used in tsk->state: */
#define TASK_RUNNING			0x0000
#define TASK_INTERRUPTIBLE		0x0001
#define TASK_UNINTERRUPTIBLE		0x0002
#define __TASK_STOPPED			0x0004
#define __TASK_TRACED			0x0008
/* Used in tsk->exit_state: */
#define EXIT_DEAD			0x0010
#define EXIT_ZOMBIE			0x0020
#define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
#define TASK_PARKED			0x0040
#define TASK_DEAD			0x0080
#define TASK_WAKEKILL			0x0100
#define TASK_WAKING			0x0200
#define TASK_NOLOAD			0x0400
#define TASK_NEW			0x0800
#define TASK_STATE_MAX			0x1000

(process) task 不可以被 interruptible 是什么意思 ? 当其运行的时候,屏蔽所有的中断吗 ?

/* set task’s state to interruptible sleep */
set_current_state(TASK_INTERRUPTIBLE)

会看 ps -elf 的结果都是很不错的

例如,ps -elf 可以看一个程序到底用了多少 CPU 时间

这个整理到 放到 process state 中去

kthread 休眠的时候的状态是 I

0 S martins3  389867    4921  0  80   0 -   469 wait_w 21:55 pts/5    00:00:00 -zsh
1 I root      390410       2  0  80   0 -     0 -      21:56 ?        00:00:00 [kworker/5:1-mm_percpu_wq]
1 I root      390620       2  0  80   0 -     0 -      22:00 ?        00:00:00 [kworker/u193:3-hclge]
1 I root      390691       2  0  80   0 -     0 -      22:02 ?        00:00:00 [kworker/u194:2-events_unbound]
1 I root      390949       2  0  80   0 -     0 -      22:03 ?        00:00:00 [kworker/u192:1-hinic3_nic_dev_wq]
1 I root      390963       2  0  80   0 -     0 -      22:03 ?        00:00:00 [kworker/7:1-mm_percpu_wq]
1 I root      390984       2  0  80   0 -     0 -      22:03 ?        00:00:00 [kworker/10:0]
 cat /proc/self/status
Name:   cat
Umask:  0022
State:  R (running)

一共就是这些状态:

/*
 * The task state array is a strange "bitmap" of
 * reasons to sleep. Thus "running" is zero, and
 * you can test for combinations of others with
 * simple bit tests.
 */
static const char * const task_state_array[] = {

	/* states in TASK_REPORT: */
	"R (running)",		/* 0x00 */
	"S (sleeping)",		/* 0x01 */
	"D (disk sleep)",	/* 0x02 */
	"T (stopped)",		/* 0x04 */
	"t (tracing stop)",	/* 0x08 */
	"X (dead)",		/* 0x10 */
	"Z (zombie)",		/* 0x20 */
	"P (parked)",		/* 0x40 */

	/* states beyond TASK_REPORT: */
	"I (idle)",		/* 0x80 */
};
/* Used in tsk->__state: */
#define TASK_RUNNING			0x00000000
#define TASK_INTERRUPTIBLE		0x00000001
#define TASK_UNINTERRUPTIBLE		0x00000002
#define __TASK_STOPPED			0x00000004
#define __TASK_TRACED			0x00000008
/* Used in tsk->exit_state: */
#define EXIT_DEAD			0x00000010
#define EXIT_ZOMBIE			0x00000020
#define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->__state again: */
#define TASK_PARKED			0x00000040
#define TASK_DEAD			0x00000080

这些就是可以被 report 的 存在了可以被 report 之外的 status ,那么配置就是 I

#define TASK_REPORT			(TASK_RUNNING | TASK_INTERRUPTIBLE | \
					 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
					 TASK_PARKED)

可以写一个程序测试一下,kthread 额外设置了什么,不过兴趣不大了。

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。