首页 > *nix技术, 内核技术, 系统管理 > Linux内核进程详解之三:flush-x:y

Linux内核进程详解之三:flush-x:y

2012年2月18日 发表评论 阅读评论 5,963 次浏览

上一篇文章《设备文件与设备号》当然不是突然穿插而来的自言自语,而是理解本文的前提,下面来看。flush-x:y是一类进程,这在系列的上一篇文章里已经讲到过,系统的绝大部分的bdi设备都会有对应的flush-x:y内核进程,而这个x:y是对应bdi设备的设备号。
先看一下系统当前挂载的文件系统:

[root@localhost lenky]# cat /proc/mounts 
rootfs / rootfs rw 0 0
/proc /proc proc rw,relatime 0 0
/sys /sys sysfs rw,seclabel,relatime 0 0
udev /dev devtmpfs rw,seclabel,relatime,size=502568k,nr_inodes=125642,mode=755 0 0
devpts /dev/pts devpts rw,seclabel,relatime,gid=5,mode=620,ptmxmode=000 0 0
tmpfs /dev/shm tmpfs rw,seclabel,relatime 0 0
/dev/mapper/VolGroup-lv_root / ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
none /selinux selinuxfs rw,relatime 0 0
udev /dev devtmpfs rw,seclabel,relatime,size=502568k,nr_inodes=125642,mode=755 0 0
/proc/bus/usb /proc/bus/usb usbfs rw,relatime 0 0
/dev/sda1 /boot ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
/dev/mapper/VolGroup-lv_home /home ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
none /proc/sys/fs/binfmt_misc binfmt_misc rw,relatime 0 0
cgroup /cgroup/cpuset cgroup rw,relatime,cpuset 0 0
cgroup /cgroup/cpu cgroup rw,relatime,cpu 0 0
cgroup /cgroup/cpuacct cgroup rw,relatime,cpuacct 0 0
cgroup /cgroup/memory cgroup rw,relatime,memory 0 0
cgroup /cgroup/devices cgroup rw,relatime,devices 0 0
cgroup /cgroup/freezer cgroup rw,relatime,freezer 0 0
cgroup /cgroup/net_cls cgroup rw,relatime,net_cls 0 0
cgroup /cgroup/blkio cgroup rw,relatime,blkio 0 0
sunrpc /var/lib/nfs/rpc_pipefs rpc_pipefs rw,relatime 0 0
/etc/auto.misc /misc autofs rw,relatime,fd=7,pgrp=1393,timeout=300,minproto=5,maxproto=5,indirect 0 0
-hosts /net autofs rw,relatime,fd=13,pgrp=1393,timeout=300,minproto=5,maxproto=5,indirect 0 0
/dev/sdb1 /home/lenky/sdb/sdb1 ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
/dev/sdc1 /home/lenky/sdc/sdc1 ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
/dev/sdc2 /home/lenky/sdc/sdc2 ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
[root@localhost lenky]# 

注意需要关注的重点:

/dev/mapper/VolGroup-lv_root / ext4
/dev/mapper/VolGroup-lv_home /home ext4
/dev/sdb1 /home/lenky/sdb/sdb1 ext4
/dev/sdc1 /home/lenky/sdc/sdc1 ext4
/dev/sdc2 /home/lenky/sdc/sdc2 ext4

对应的设备号分别为:

[root@localhost lenky]# ls -l /dev/dm-*
brw-rw----. 1 root disk 253, 0 Jan 12 06:24 /dev/dm-0
brw-rw----. 1 root disk 253, 1 Jan 12 06:24 /dev/dm-1
brw-rw----. 1 root disk 253, 2 Jan 12 06:24 /dev/dm-2
[root@localhost lenky]# 
[root@localhost lenky]# ls -l /dev/mapper/*
crw-rw----. 1 root root 10, 236 Jan 12 06:24 /dev/mapper/control
lrwxrwxrwx. 1 root root       7 Jan 12 06:24 /dev/mapper/VolGroup-lv_home -> ../dm-2
lrwxrwxrwx. 1 root root       7 Jan 12 06:24 /dev/mapper/VolGroup-lv_root -> ../dm-0
lrwxrwxrwx. 1 root root       7 Jan 12 06:24 /dev/mapper/VolGroup-lv_swap -> ../dm-1
[root@localhost lenky]# 
[root@localhost lenky]# ls -l /dev/sda*
brw-rw----. 1 root disk 8, 0 Jan 12 06:24 /dev/sda
brw-rw----. 1 root disk 8, 1 Jan 12 06:24 /dev/sda1
brw-rw----. 1 root disk 8, 2 Jan 12 06:24 /dev/sda2
[root@localhost lenky]# 
[root@localhost lenky]# ls -l /dev/sdb*
brw-rw----. 1 root disk 8, 16 Jan 12 06:25 /dev/sdb
brw-rw----. 1 root disk 8, 17 Jan 12 06:25 /dev/sdb1
[root@localhost lenky]# 
[root@localhost lenky]# ls -l /dev/sdc*
brw-rw----. 1 root disk 8, 32 Jan 12 06:29 /dev/sdc
brw-rw----. 1 root disk 8, 33 Jan 12 06:39 /dev/sdc1
brw-rw----. 1 root disk 8, 34 Jan 12 06:29 /dev/sdc2
brw-rw----. 1 root disk 8, 35 Jan 12 06:29 /dev/sdc3
[root@localhost lenky]# 

在任意时刻,我们能看到的flush-x:y内核进程并不固定,原因之前已经说过:

[root@localhost lenky]# ps aux | grep flush-
root      1250  0.0  0.0      0     0 ?        S    06:24   0:00 [flush-253:0]
root      2180  0.0  0.0      0     0 ?        S    06:39   0:00 [flush-253:2]
root      2186  2.0  0.0      0     0 ?        S    06:39   0:07 [flush-8:32]
root      2329  0.0  0.0 103204   800 pts/3    S+   06:45   0:00 grep flush-
[root@localhost lenky]# 

调用sync命令,强制同步操作会创建所有对应的flush-x:y内核进程:

[root@localhost lenky]# sync
[root@localhost lenky]# ps aux | grep flush-
root      1250  0.0  0.0      0     0 ?        S    06:24   0:00 [flush-253:0]
root      2180  0.0  0.0      0     0 ?        S    06:39   0:00 [flush-253:2]
root      2186  2.0  0.0      0     0 ?        S    06:39   0:07 [flush-8:32]
root      2331  0.0  0.0      0     0 ?        S    06:45   0:00 [flush-8:0]
root      2332  0.0  0.0      0     0 ?        S    06:45   0:00 [flush-8:16]
root      2334  0.0  0.0 103204   800 pts/3    S+   06:45   0:00 grep flush-
[root@localhost lenky]# 

可以看到flush-x:y内核进程是对应bdi整个设备的,比如这里的单个磁盘,而不是各个磁盘分区。
最后来看代码,flush-x:y内核进程的主体函数是bdi_writeback_thread(…):

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 *
 * @data: pointer to the struct bdi_writeback serviced by this thread.
 *
 * Runs until kthread_should_stop() reports true.  Each iteration performs
 * one wb_do_writeback() pass and then sleeps, either with a timeout (dirty
 * data remains and periodic writeback is enabled) or indefinitely.  On exit,
 * any work still queued on the bdi is flushed once more.  Always returns 0.
 */
int bdi_writeback_thread(void *data)
{
	struct bdi_writeback *wb = data;
	struct backing_dev_info *bdi = wb->bdi;
	long pages_written;

	current->flags |= PF_SWAPWRITE;
	set_freezable();
	/* Count thread start-up as activity. */
	wb->last_active = jiffies;

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	trace_writeback_thread_start(bdi);

	while (!kthread_should_stop()) {
		/*
		 * Remove own delayed wake-up timer, since we are already awake
		 * and we'll take care of the periodic write-back.
		 */
		del_timer(&wb->wakeup_timer);

		pages_written = wb_do_writeback(wb, 0);

		trace_writeback_pages_written(pages_written);

		/*
		 * Only refresh the activity timestamp when something was
		 * actually written.
		 * NOTE(review): last_active is presumably what the forker
		 * thread inspects to reap idle flusher threads - confirm
		 * against the code that calls kthread_stop() on us.
		 */
		if (pages_written)
			wb->last_active = jiffies;

		/*
		 * Change state to sleeping *before* re-checking for queued
		 * work or a stop request; presumably this ordering ensures a
		 * wake-up arriving between the check and the sleep below is
		 * not lost.  If there is something to do, switch back to
		 * running and loop again without sleeping.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			continue;
		}

		/*
		 * Dirty data remains and periodic writeback is enabled
		 * (non-zero interval): sleep for one interval.  The value is
		 * multiplied by 10 to obtain milliseconds, i.e. it is kept in
		 * units of 10 ms.
		 */
		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
		else {
			/*
			 * We have nothing to do, so can go sleep without any
			 * timeout and save power. When a work is queued or
			 * something is made dirty - we will be woken up.
			 */
			schedule();
		}

		try_to_freeze();
	}

	/* Flush any work that raced with us exiting */
	if (!list_empty(&bdi->work_list))
		wb_do_writeback(wb, 1);

	trace_writeback_thread_stop(bdi);
	return 0;
}

函数主体是一个while循环,while语句调用一个判断函数决定是否该结束循环:

/**
 * kthread_should_stop - has a stop been requested for the calling kthread?
 *
 * Reports the should_stop flag of the current kernel thread.  The flag is
 * raised by kthread_stop(), which also wakes the thread; once this returns
 * non-zero the thread function is expected to return, and that return
 * value is what kthread_stop() hands back to its caller.
 */
int kthread_should_stop(void)
{
	int stop_requested = to_kthread(current)->should_stop;

	return stop_requested;
}
EXPORT_SYMBOL(kthread_should_stop);

而这个should_stop标记字段会在bdi-default内核进程的KILL_THREAD动作里进行修改(上一篇文章提到过),也就是通过这个字段实现bdi-default内核进程对flush-x:y内核进程的控制:

		case KILL_THREAD:
			__set_current_state(TASK_RUNNING);
			kthread_stop(task);
			break;

/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_stop() for @k to return true, wakes it, and
 * waits for it to exit. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will exit without
 * calling threadfn().
 *
 * If threadfn() may call do_exit() itself, the caller must ensure
 * task_struct can't go away.
 *
 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
 * was never called.
 */
int kthread_stop(struct task_struct *k)
{
	struct kthread *kthread;
	int ret;

	trace_sched_kthread_stop(k);
	/* Hold a reference on @k for the duration; dropped by put_task_struct() below. */
	get_task_struct(k);

	kthread = to_kthread(k);
	barrier(); /* it might have exited */
	/*
	 * Only signal and wait when vfork_done is still set.
	 * NOTE(review): a NULL vfork_done is presumably how an already-exited
	 * thread is recognised here - confirm against the kthread exit path.
	 */
	if (k->vfork_done != NULL) {
		kthread->should_stop = 1;
		wake_up_process(k);
		/* Block until the thread signals the 'exited' completion. */
		wait_for_completion(&kthread->exited);
	}
	/* The value handed back to the caller is taken from the task's exit_code. */
	ret = k->exit_code;

	put_task_struct(k);
	trace_sched_kthread_stop_ret(ret);

	return ret;
}
EXPORT_SYMBOL(kthread_stop);

while循环内的工作,除去其它细节,值得关注的主要有三点:第一,修改最后活动时间(语句:wb->last_active = jiffies;),这样bdi-default内核进程才能通过last_active这个字段来判断flush-x:y内核进程的活动状态,如果很久没有活动(比较的就是last_active字段)则把它kill掉;第二,当然就是进程的主要工作,调用函数wb_do_writeback(…)进行同步操作;第三,如果在进行一次同步操作之后,又有新的脏数据需要同步,那么先睡眠,等间隔时间(默认5秒)后超时醒来继续工作;如果已经没有脏数据需要同步,那么直接schedule()调度其它进程,而进程本身进入可中断睡眠状态(注意前面的语句:set_current_state(TASK_INTERRUPTIBLE);),等待后续被唤醒继续工作或被kill掉。
整个bdi-default和flush-x:y内核进程讲完了,为什么会有这样的设计?在这里有很好的说明:http://lwn.net/Articles/396757/,相比以前的多个pdflush间隔醒来,改进之后只需bdi-default一个内核进程间隔醒来就行了,这在电池供电设备上明显比较省电。

转载请保留地址:http://www.lenky.info/archives/2012/02/1138 或 http://lenky.info/?p=1138


备注:如无特殊说明,文章内容均出自Lenky个人的真实理解而并非存心妄自揣测来故意愚人耳目。由于个人水平有限,虽力求内容正确无误,但仍然难免出错,请勿见怪,如果可以则请留言告之,并欢迎来讨论。另外值得说明的是,Lenky的部分文章以及部分内容参考借鉴了网络上各位网友的热心分享,特别是一些带有完全参考的文章,其后附带的链接内容也许更直接、更丰富,而我只是做了一下归纳&转述,在此也一并表示感谢。关于本站的所有技术文章,欢迎转载,但请遵从CC创作共享协议,而一些私人性质较强的心情随笔,建议不要转载。

法律:根据最新颁布的《信息网络传播权保护条例》,如果您认为本文章的任何内容侵犯了您的权利,请以Email或书面等方式告知,本站将及时删除相关内容或链接。

  1. 本文目前尚无任何评论.
  1. 本文目前尚无任何 trackbacks 和 pingbacks.