Linux : task work 机制

时间:2022-08-15 10:14:50

task work机制可以在内核中向指定的进程添加一些任务函数,这些任务函数会在进程返回用户态时执行,使用的是该进程的上下文。包括下面的这些API:

  • task_work_add
  • task_work_cancel
  • task_work_run

进程对象task_struct中有个字段用来存储这些待进行的任务列表头即task_works,这个结构体包含一个next指针和需要执行的函数指针。

205 /**
206 * struct callback_head - callback structure for use with RCU and task_work
207 * @next: next update requests in a list
208 * @func: actual update function to call after the grace period.
209 */
210 struct callback_head {
211 struct callback_head *next;
212 void (*func)(struct callback_head *head);
213 };
  4
5 static struct callback_head work_exited; /* all we need is ->next == NULL */
6
7 /**
8 * task_work_add - ask the @task to execute @work->func()
9 * @task: the task which should run the callback
10 * @work: the callback to run
11 * @notify: send the notification if true
12 *
13 * Queue @work for task_work_run() below and notify the @task if @notify.
14 * Fails if the @task is exiting/exited and thus it can't process this @work.
15 * Otherwise @work->func() will be called when the @task returns from kernel
16 * mode or exits.
17 *
18 * This is like the signal handler which runs in kernel mode, but it doesn't
19 * try to wake up the @task.
20 *
21 * RETURNS:
22 * 0 if succeeds or -ESRCH.
23 */
24 int
25 task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
26 {
27 struct callback_head *head;
28
29 do {
30 head = ACCESS_ONCE(task->task_works);
31 if (unlikely(head == &work_exited))
32 return -ESRCH;
33 work->next = head;
34 } while (cmpxchg(&task->task_works, head, work) != head);
35
36 if (notify)
37 set_notify_resume(task);
38 return 0;
39 }

主要工作:

1. 通过CAS以无锁的形式添加了一个链表元素。(新元素排在原有链表头部)

2. set_notify_resume函数向指定的进程设置了一个_TIF_NOTIFY_RESUME标记。

task_work_run执行时机

在返回用户态之前会对当前进程的标记检查,如果相关标记置位则会调用do_notify_resume

595 int_signal:
596 testl $_TIF_DO_NOTIFY_MASK,%edx
597 jz 1f
598 movq %rsp,%rdi # &ptregs -> arg1
599 xorl %esi,%esi # oldset -> arg2
600 call do_notify_resume
601 1: movl $_TIF_WORK_MASK,%edi
602 int_restore_rest:
603 RESTORE_REST
604 DISABLE_INTERRUPTS(CLBR_NONE)
605 TRACE_IRQS_OFF
606 jmp int_with_check
607 CFI_ENDPROC
608 END(system_call)

以上文件为entry_64.S,而标记定义在thread_info.c中

130 /* work to do on interrupt/exception return */
131 #define _TIF_WORK_MASK \
132 (0x0000FFFF & \
133 ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
134 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
 70 #define TIF_SYSCALL_TRACE       0       /* syscall trace active */
71 #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
72 #define TIF_SIGPENDING 2 /* signal pending */
73 #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
74 #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
75 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
76 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
77 #define TIF_SECCOMP 8 /* secure computing */
78 #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
79 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
80 #define TIF_UPROBE 12 /* breakpointed or singlestepping */
81 #define TIF_NOTSC 16 /* TSC is not accessible in userland */
82 #define TIF_IA32 17 /* IA32 compatibility process */
83 #define TIF_FORK 18 /* ret_from_fork */
84 #define TIF_NOHZ 19 /* in adaptive nohz mode */
85 #define TIF_MEMDIE 20 /* is terminating due to OOM killer */
86 #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
87 #define TIF_IO_BITMAP 22 /* uses I/O bitmap */
88 #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
89 #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
90 #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
91 #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
92 #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
93 #define TIF_X32 30 /* 32-bit native x86-64 binary */
94

即_TIF_WORK_MASK表示除开(_TIF_SYSCALL_TRACE, _TIF_SYSCALL_AUDIT, _TIF_SINGLESTEP, _TIF_SECCOMP, _TIF_SYSCALL_EMU)之外的所有标记。自然包括了_TIF_NOTIFY_RESUME标记。

do_notify_resume函数

729 /*
730 * notification of userspace execution resumption
731 * - triggered by the TIF_WORK_MASK flags
732 */
733 __visible void
734 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
735 {
736 user_exit();
737
738 #ifdef CONFIG_X86_MCE
739 /* notify userspace of pending MCEs */
740 if (thread_info_flags & _TIF_MCE_NOTIFY)
741 mce_notify_process();
742 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
743
744 if (thread_info_flags & _TIF_UPROBE)
745 uprobe_notify_resume(regs);
746
747 /* deal with pending signal delivery */
748 if (thread_info_flags & _TIF_SIGPENDING)
749 do_signal(regs);
750
751 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
752 clear_thread_flag(TIF_NOTIFY_RESUME);
753 tracehook_notify_resume(regs);
754 }
755 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
756 fire_user_return_notifiers();
757
758 user_enter();
759 }

可以看到在其中调用tracehook_notify_resume函数,也包括其他一些如信号处理相关的函数。

tracehook_notify_resume

174 /**
175 * tracehook_notify_resume - report when about to return to user mode
176 * @regs: user-mode registers of @current task
177 *
178 * This is called when %TIF_NOTIFY_RESUME has been set. Now we are
179 * about to return to user mode, and the user state in @regs can be
180 * inspected or adjusted. The caller in arch code has cleared
181 * %TIF_NOTIFY_RESUME before the call. If the flag gets set again
182 * asynchronously, this will be called again before we return to
183 * user mode.
184 *
185 * Called without locks.
186 */
187 static inline void tracehook_notify_resume(struct pt_regs *regs)
188 {
189 /*
190 * The caller just cleared TIF_NOTIFY_RESUME. This barrier
191 * pairs with task_work_add()->set_notify_resume() after
192 * hlist_add_head(task->task_works);
193 */
194 smp_mb__after_atomic();
195 if (unlikely(current->task_works))
196 task_work_run();
197 }

在进程对象的task_works不为null的情况下才有任务需要执行。

task_work_run

 77 /**
78 * task_work_run - execute the works added by task_work_add()
79 *
80 * Flush the pending works. Should be used by the core kernel code.
81 * Called before the task returns to the user-mode or stops, or when
82 * it exits. In the latter case task_work_add() can no longer add the
83 * new work after task_work_run() returns.
84 */
85 void task_work_run(void)
86 {
87 struct task_struct *task = current;
88 struct callback_head *work, *head, *next;
89
90 for (;;) {
91 /*
92 * work->func() can do task_work_add(), do not set
93 * work_exited unless the list is empty.
94 */
95 do {
96 work = ACCESS_ONCE(task->task_works);
97 head = !work && (task->flags & PF_EXITING) ?
98 &work_exited : NULL;
99 } while (cmpxchg(&task->task_works, work, head) != work);
100
101 if (!work)
102 break;
103 /*
104 * Synchronize with task_work_cancel(). It can't remove
105 * the first entry == work, cmpxchg(task_works) should
106 * fail, but it can play with *work and other entries.
107 */
108 raw_spin_unlock_wait(&task->pi_lock);
109 smp_mb();
110
111 /* Reverse the list to run the works in fifo order */
112 head = NULL;
113 do {
114 next = work->next;
115 work->next = head;
116 head = work;
117 work = next;
118 } while (work);
119
120 work = head;
121 do {
122 next = work->next;
123 work->func(work);
124 work = next;
125 cond_resched();
126 } while (work);
127 }
128 }

1. 通过CAS,以无锁的方式取得task_works链表

2. 因为原链表是按元素添加到链表的时间逆序排列的(见task_work_add),先把链表反转一遍

3. 反转链表后,遍历链表,执行各个元素的任务函数即work->func(work)