Linux内核源代码情景分析-exit()

执行完/bin/echo之后，会调用do_exit，销毁子进程：

我们还是先从系统调用exit()说起，先来看exit()的实现，进入到内核态执行sys_exit。

/*
 * sys_exit - handler for the exit() system call.
 *
 * Only the low 8 bits of error_code are kept; they are moved into
 * bits 8..15 of the status word that is handed to do_exit().
 */
asmlinkage long sys_exit(int error_code)
{
	int status = (error_code & 0xff) << 8;

	do_exit(status);
}

/*
 * do_exit - tear down the current task.
 *
 * @code: value stored in tsk->exit_code before the parent is notified.
 *
 * Panics when called from interrupt context, by the task with pid 0,
 * or by the task with pid 1.  Otherwise it marks the task PF_EXITING,
 * releases its mm, files, fs and signal-handler structures, records
 * the exit code, calls exit_notify() and finally schedule().
 */
NORET_TYPE void do_exit(long code)
{
	struct task_struct *tsk = current;

	if (in_interrupt())// exiting from interrupt context is not allowed
		panic("Aiee, killing interrupt handler!");
	if (!tsk->pid)// pid 0 (the idle task) must never exit
		panic("Attempted to kill the idle task!");
	if (tsk->pid == 1)// pid 1 (init) must never exit
		panic("Attempted to kill init!");
	tsk->flags |= PF_EXITING;
	del_timer_sync(&tsk->real_timer);

fake_volatile:
#ifdef CONFIG_BSD_PROCESS_ACCT
	acct_process(code);
#endif
	__exit_mm(tsk);// drop this task's mm reference; mmput() tears the address space down only when mm_users reaches zero

	lock_kernel();
	sem_exit();// NOTE(review): presumably semaphore (not signal) cleanup, judging by the name — body not shown here
	__exit_files(tsk);// drop the files_struct reference; it is freed only when files->count reaches zero
	__exit_fs(tsk);// drop the fs_struct reference; it is freed only when fs->count reaches zero
	exit_sighand(tsk);// drop the signal_struct reference; it is freed only when sig->count reaches zero
	exit_thread();// NOTE(review): the article describes this as an empty function — body not shown here

	if (current->leader)
		disassociate_ctty(1);

	put_exec_domain(tsk->exec_domain);
	if (tsk->binfmt && tsk->binfmt->module)
		__MOD_DEC_USE_COUNT(tsk->binfmt->module);

	tsk->exit_code = code;
	exit_notify();// marks the current task TASK_ZOMBIE, signals its parent and hands its children to a new parent
	schedule();
	BUG();
/*
 * In order to get rid of the "volatile function does return" message
 * I did this little loop that confuses gcc to think do_exit really
 * is volatile. In fact it's schedule() that is volatile in some
 * circumstances: when current->state = ZOMBIE, schedule() never
 * returns.
 *
 * In fact the natural way to do all this is to have the label and the
 * goto right after each other, but I put the fake_volatile label at
 * the start of the function just in case something /really/ bad
 * happens, and the schedule returns. This way we can try again. I'm
 * not paranoid: it's just that everybody is out to get me.
 */
	goto fake_volatile;
}

__exit_mm，子进程自立门户，释放mm_struct，vm_area_struct；释放页目录表，页表：

/*
 * __exit_mm - detach a task from its address space.
 *
 * Calls mm_release(), then, if the task has an mm, clears tsk->mm and
 * drops the task's user reference with mmput().
 */
static inline void __exit_mm(struct task_struct * tsk)
{
	struct mm_struct *old_mm = tsk->mm;

	mm_release();
	if (!old_mm)
		return;

	/*
	 * Extra mm_count reference taken before detaching.
	 * NOTE(review): presumably kept on behalf of active_mm — confirm.
	 */
	atomic_inc(&old_mm->mm_count);
	if (old_mm != tsk->active_mm) {
		BUG();
	}

	/* more a memory barrier than a real lock */
	task_lock(tsk);
	tsk->mm = NULL;
	task_unlock(tsk);

	enter_lazy_tlb(old_mm, current, smp_processor_id());

	/* The actual release work happens here. */
	mmput(old_mm);
}

/*
 * mmput - drop one user reference (mm_users) on an mm.
 *
 * Only the caller that brings mm_users to zero goes on to unlink the mm
 * from the mm list, tear down its mappings and drop the mm_count
 * reference via mmdrop().
 */
void mmput(struct mm_struct *mm)
{
	if (!atomic_dec_and_lock(&mm->mm_users, &mmlist_lock))
		return;

	/* mmlist_lock is held at this point; it is released right after the unlink. */
	list_del(&mm->mmlist);
	spin_unlock(&mmlist_lock);

	/* Free the vm_area_structs and clear the user page-table entries. */
	exit_mmap(mm);

	/* Drop mm_count; __mmdrop() frees the pgd and the mm_struct at zero. */
	mmdrop(mm);
}

/*
 * exit_mmap - release every mapping of an address space.
 *
 * Detaches the vma list from the mm under page_table_lock, resets the
 * rss/total_vm/locked_vm counters, then walks the detached list and,
 * for each vma, calls its close hook (if any), zaps its page range,
 * drops its file reference and frees the vma.  Finally the user part
 * of the page tables is cleared.
 */
void exit_mmap(struct mm_struct * mm)
{
	struct vm_area_struct *vma;
	struct vm_area_struct *next_vma;

	release_segments(mm);

	/* Take the whole vma list away from the mm in one step. */
	spin_lock(&mm->page_table_lock);
	vma = mm->mmap;
	mm->mmap_cache = NULL;
	mm->mmap_avl = NULL;
	mm->mmap = NULL;
	spin_unlock(&mm->page_table_lock);

	mm->rss = 0;
	mm->total_vm = 0;
	mm->locked_vm = 0;

	while (vma != NULL) {
		/* Sample the bounds and the successor before the vma is touched. */
		unsigned long start = vma->vm_start;
		unsigned long end = vma->vm_end;
		unsigned long size = end - start;

		next_vma = vma->vm_next;

		if (vma->vm_ops && vma->vm_ops->close) {
			vma->vm_ops->close(vma);
		}

		mm->map_count--;
		remove_shared_vm_struct(vma);
		flush_cache_range(mm, start, end);
		zap_page_range(mm, start, size);

		if (vma->vm_file) {
			fput(vma->vm_file);
		}
		kmem_cache_free(vm_area_cachep, vma);

		vma = next_vma;
	}

	/* This is just debugging */
	if (mm->map_count) {
		printk("exit_mmap: map count is %d\n", mm->map_count);
	}

	clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
}

/*
 * mmdrop - drop one mm_count reference; the caller that brings the
 * count to zero frees the mm through __mmdrop().
 */
static inline void mmdrop(struct mm_struct * mm)
{
	if (!atomic_dec_and_test(&mm->mm_count))
		return;

	__mmdrop(mm);
}

/*
 * __mmdrop - free an mm: its page directory, its context and the
 * mm_struct itself.  Being asked to free init_mm is a BUG().
 */
inline void __mmdrop(struct mm_struct *mm)
{
	if (mm == &init_mm) {
		BUG();
	}

	pgd_free(mm->pgd);
	destroy_context(mm);
	free_mm(mm);
}

回到do_exit，继续执行__exit_files，子进程自立门户，释放files_struct数据结构，代码如下：

/*
 * __exit_files - detach a task from its open-file table.
 *
 * Clears tsk->files under the task lock and then drops the task's
 * reference with put_files_struct().  Does nothing if the task has no
 * files_struct.
 */
static inline void __exit_files(struct task_struct *tsk)
{
	struct files_struct *old_files = tsk->files;

	if (!old_files)
		return;

	task_lock(tsk);
	tsk->files = NULL;
	task_unlock(tsk);

	put_files_struct(old_files);
}

/*
 * put_files_struct - drop one reference on a files_struct.
 *
 * The caller that brings files->count to zero closes the files and
 * frees the structure, including any fd array / fd sets that are not
 * the built-in ones.
 */
void put_files_struct(struct files_struct *files)
{
	if (!atomic_dec_and_test(&files->count))
		return;

	close_files(files);

	/*
	 * Free the fd and fdset arrays if we expanded them.
	 */
	if (files->fd != &files->fd_array[0]) {
		free_fd_array(files->fd, files->max_fds);
	}
	if (files->max_fdset > __FD_SETSIZE) {
		free_fdset(files->open_fds, files->max_fdset);
		free_fdset(files->close_on_exec, files->max_fdset);
	}

	kmem_cache_free(files_cachep, files);
}

继续执行__exit_fs，子进程自立门户，那就要释放fs_struct数据结构，代码如下：

/*
 * __exit_fs - detach a task from its fs_struct.
 *
 * Clears tsk->fs under the task lock and then drops the task's
 * reference with __put_fs_struct().  Does nothing if tsk->fs is NULL.
 */
static inline void __exit_fs(struct task_struct *tsk)
{
	struct fs_struct *old_fs = tsk->fs;

	if (!old_fs)
		return;

	task_lock(tsk);
	tsk->fs = NULL;
	task_unlock(tsk);

	__put_fs_struct(old_fs);
}

/*
 * __put_fs_struct - drop one reference on an fs_struct.
 *
 * The caller that brings fs->count to zero releases the root, pwd and
 * (if set) altroot dentry/mount pairs and frees the structure.
 */
static inline void __put_fs_struct(struct fs_struct *fs)
{
	/* No need to hold fs->lock if we are killing it */
	if (!atomic_dec_and_test(&fs->count))
		return;

	dput(fs->root);
	mntput(fs->rootmnt);

	dput(fs->pwd);
	mntput(fs->pwdmnt);

	if (fs->altroot) {
		dput(fs->altroot);
		mntput(fs->altrootmnt);
	}

	kmem_cache_free(fs_cachep, fs);
}

继续执行exit_sighand，子进程自立门户，那就要释放signal_struct数据结构，代码如下：

/*
 * exit_sighand - detach a task from its signal_struct.
 *
 * With sigmask_lock held and interrupts off: clears tsk->sig, frees the
 * structure if this was the last reference, resets tsk->sigpending and
 * flushes the task's pending signal queue.
 */
void exit_sighand(struct task_struct *tsk)
{
	struct signal_struct *sighand = tsk->sig;

	spin_lock_irq(&tsk->sigmask_lock);

	if (sighand != NULL) {
		tsk->sig = NULL;
		/* Free only when the count drops to zero. */
		if (atomic_dec_and_test(&sighand->count)) {
			kmem_cache_free(sigact_cachep, sighand);
		}
	}

	tsk->sigpending = 0;
	flush_sigqueue(&tsk->pending);

	spin_unlock_irq(&tsk->sigmask_lock);
}

继续执行exit_notify，将当前进程设置为僵死状态，给父进程发信号，并将当前进程的子进程的父进程设置为init进程，代码如下：

/*
 * exit_notify - notification step of an exiting task.
 *
 * Re-targets the original-parent pointer of the caller's children via
 * forget_original_parent(), sends SIGHUP/SIGCONT to process groups that
 * the orphan checks below select, marks the caller TASK_ZOMBIE, signals
 * its parent, and moves every task on current's child list onto the
 * child list of that task's p_opptr.
 */
static void exit_notify(void)
{
	struct task_struct * p, *t;

	forget_original_parent(current);// children whose p_opptr is current get a new p_opptr (the reaper)
	/*
	 * Check to see if any process groups have become orphaned
	 * as a result of our exiting, and if they have any stopped
	 * jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
	 *
	 * Case i: Our father is in a different pgrp than we are
	 * and we were the only connection outside, so our pgrp
	 * is about to become orphaned.
	 */
	 
	t = current->p_pptr;// our own p_pptr; forget_original_parent() only rewrote p_opptr of other tasks
	
	if ((t->pgrp != current->pgrp) &&
	    (t->session == current->session) &&
	    will_become_orphaned_pgrp(current->pgrp, current) &&
	    has_stopped_jobs(current->pgrp)) {
		kill_pg(current->pgrp,SIGHUP,1);
		kill_pg(current->pgrp,SIGCONT,1);
	}

	/* Let father know we died 
	 *
	 * Thread signals are configurable, but you aren't going to use
	 * that to send signals to arbitrary processes. 
	 * That stops right now.
	 *
	 * If the parent exec id doesn't match the exec id we saved
	 * when we started then we know the parent has changed security
	 * domain.
	 *
	 * If our self_exec id doesn't match our parent_exec_id then
	 * we have changed execution domain as these two values started
	 * the same after a fork.
	 *	
	 */
	
	if(current->exit_signal != SIGCHLD &&
	    ( current->parent_exec_id != t->self_exec_id  ||
	      current->self_exec_id != current->parent_exec_id) 
	    && !capable(CAP_KILL))
		current->exit_signal = SIGCHLD;// fall back to SIGCHLD as the signal sent to the parent


	/*
	 * This loop does two things:
	 *
  	 * A.  Make init inherit all the child processes
	 * B.  Check to see if any process groups have become orphaned
	 *	as a result of our exiting, and if they have any stopped
	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
	 */

	write_lock_irq(&tasklist_lock);
	current->state = TASK_ZOMBIE;// the current task becomes a zombie
	do_notify_parent(current, current->exit_signal);// signal our parent
	while (current->p_cptr != NULL) {
		/* Unlink the first child from our child list. */
		p = current->p_cptr;
		current->p_cptr = p->p_osptr;
		p->p_ysptr = NULL;
		p->ptrace = 0;

		p->p_pptr = p->p_opptr;// p_pptr now equals p_opptr (the reaper, for children whose p_opptr was current)
		/* Link the child at the head of its new parent's child list. */
		p->p_osptr = p->p_pptr->p_cptr;
		if (p->p_osptr)
			p->p_osptr->p_ysptr = p;
		p->p_pptr->p_cptr = p;
		if (p->state == TASK_ZOMBIE)
			do_notify_parent(p, p->exit_signal);
		/*
		 * process group orphan check
		 * Case ii: Our child is in a different pgrp
		 * than we are, and it was the only connection
		 * outside, so the child pgrp is now orphaned.
		 */
		if ((p->pgrp != current->pgrp) &&
		    (p->session == current->session)) {
			int pgrp = p->pgrp;

			/* tasklist_lock is dropped around the orphan check and the kill_pg() calls */
			write_unlock_irq(&tasklist_lock);
			if (is_orphaned_pgrp(pgrp) && has_stopped_jobs(pgrp)) {
				kill_pg(pgrp,SIGHUP,1);
				kill_pg(pgrp,SIGCONT,1);
			}
			write_lock_irq(&tasklist_lock);
		}
	}
	write_unlock_irq(&tasklist_lock);
}

/*
 * forget_original_parent - give a new original parent to every task
 * whose p_opptr is @father.
 *
 * The new parent is the next thread in father's thread group, or
 * child_reaper when next_thread() returns father itself.  Each affected
 * task has its exit_signal reset to SIGCHLD, its self_exec_id bumped,
 * and is sent its pdeath_signal if one is set.  Only p_opptr is
 * changed here.
 */
static inline void forget_original_parent(struct task_struct * father)
{
	struct task_struct *task;
	struct task_struct *new_parent;

	read_lock(&tasklist_lock);

	/* Next in our thread group */
	new_parent = next_thread(father);
	if (new_parent == father) {
		new_parent = child_reaper;
	}

	for_each_task(task) {
		if (task->p_opptr == father) {
			/* We dont want people slaying init */
			task->exit_signal = SIGCHLD;
			task->self_exec_id++;
			task->p_opptr = new_parent;
			if (task->pdeath_signal) {
				send_sig(task->pdeath_signal, task, 0);
			}
		}
	}

	read_unlock(&tasklist_lock);
}

/*
 * do_notify_parent - send @sig, with a filled-in siginfo, to tsk's
 * parent (tsk->p_pptr) and wake that parent up.
 *
 * @tsk: task whose state change is being reported.
 * @sig: signal number to deliver.
 *
 * si_code / si_status are derived from tsk->state and tsk->exit_code:
 *   TASK_STOPPED      -> CLD_TRAPPED if ptraced, else CLD_STOPPED;
 *                        status = exit_code & 0x7f
 *   exit_code & 0x80  -> CLD_DUMPED;  status = exit_code & 0x7f
 *   exit_code & 0x7f  -> CLD_KILLED;  status = exit_code & 0x7f
 *   otherwise         -> CLD_EXITED;  status = exit_code >> 8
 */
void do_notify_parent(struct task_struct *tsk, int sig)
{
	struct siginfo info;
	int why;
	int status;

	info.si_signo = sig;
	info.si_errno = 0;
	info.si_pid = tsk->pid;
	info.si_uid = tsk->uid;

	/* FIXME: find out whether or not this is supposed to be c*time. */
	info.si_utime = tsk->times.tms_utime;
	info.si_stime = tsk->times.tms_stime;

	status = tsk->exit_code & 0x7f;

	if (tsk->state == TASK_STOPPED) {
		/* FIXME -- can we deduce CLD_TRAPPED or CLD_CONTINUED? */
		why = (tsk->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
	} else if (tsk->exit_code & 0x80) {
		why = CLD_DUMPED;
	} else if (tsk->exit_code & 0x7f) {
		why = CLD_KILLED;
	} else {
		why = CLD_EXITED;
		status = tsk->exit_code >> 8;
	}

	info.si_code = why;
	info.si_status = status;

	/* Deliver the signal to the parent ... */
	send_sig_info(sig, &info, tsk->p_pptr);
	/* ... and wake it (NOTE(review): presumably it may be sleeping in wait — confirm). */
	wake_up_parent(tsk->p_pptr);
}

至此，进程的基本资源都已经释放了，但是当前进程的残骸仍旧占用着最低限度的资源，包括其task_struct数据结构和系统空间堆栈所在的两个页面。当前进程自己不释放这两个页面，就像人们自己并不在临终注销自己的户口一样，而是通知其父进程，让父进程料理后事。当前进程状态为 TASK_ZOMBIE，schedule时，无限延迟调度该进程。

下面，最后执行schedule，假设只有父进程和子进程，父进程的状态已经是TASK_RUNNING，切换到父进程继续执行。

/*
 * switch_to(prev, next, last) - switch kernel stacks and resume @next.
 *
 * Operands: %0 = prev->thread.esp, %1 = prev->thread.eip,
 *           %3 = next->thread.esp, %4 = next->thread.eip;
 *           prev is passed in eax and ebx, next in edx, and ebx is
 *           written back to @last.
 *
 * All annotations are written as block comments placed before the
 * line-continuation backslash: a backslash must be the last character
 * on its line, so a trailing "//" comment after it would break the
 * macro definition.
 */
#define switch_to(prev,next,last) do {					\
	asm volatile("pushl %%esi\n\t"		/* save ESI on prev's stack */	\
		     "pushl %%edi\n\t"		/* save EDI on prev's stack */	\
		     "pushl %%ebp\n\t"		/* save EBP on prev's stack */	\
		     "movl %%esp,%0\n\t"	/* save ESP into prev->thread.esp */	\
		     "movl %3,%%esp\n\t"	/* load next->thread.esp: the stack is now next's */	\
		     "movl $1f,%1\n\t"		/* save label 1 below as prev->thread.eip */	\
		     "pushl %4\n\t"		/* push next->thread.eip onto next's stack */	\
		     "jmp __switch_to\n"	/* entered by jmp, not call: the value just pushed serves as the return address when __switch_to returns */	\
		     "1:\t"			/* resume point of a task that was switched out through this macro */	\
		     "popl %%ebp\n\t"		/* running on the resumed task's stack: restore what it pushed above */	\
		     "popl %%edi\n\t"					\
		     "popl %%esi\n\t"					\
		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
		      "=b" (last)					\
		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
		      "a" (prev), "d" (next),				\
		      "b" (prev));					\
} while (0)

父进程此前在sys_wait4中等待；切换回来后，父进程从"1:\t"处继续执行，并返回到sys_wait4函数中继续运行。

秒客网

Linux内核源代码情景分析-exit()

相关文章