// SPDX-License-Identifier: GPL-2.0
/*
 * Implement CPU time clocks for the POSIX clock interface.
 */

#include <linux/sched/signal.h>
#include <linux/sched/cputime.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
#include <linux/tick.h>
#include <linux/workqueue.h>
#include <linux/compat.h>
#include <linux/sched/deadline.h>
#include <linux/task_work.h>

#include "posix-timers.h"

static void posix_cpu_timer_rearm(struct k_itimer *timer);

void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
{
	posix_cputimers_init(pct);
	if (cpu_limit != RLIM_INFINITY) {
		pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
		pct->timers_active = true;
	}
}

/*
 * Called after updating RLIMIT_CPU to run the cpu timer and update
 * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
 * necessary. Needs siglock protection since other code may update the
 * expiration cache as well.
 *
 * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
 * we cannot lock_task_sighand. Cannot fail if task is current.
 */
int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
	u64 nsecs = rlim_new * NSEC_PER_SEC;
	unsigned long irq_fl;

	if (!lock_task_sighand(task, &irq_fl))
		return -ESRCH;
	set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
	unlock_task_sighand(task, &irq_fl);
	return 0;
}
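
/*
 * Illustrative sketch only (not a caller that exists in this file): a
 * setrlimit()-style path would feed the new soft limit in seconds
 * straight into update_rlimit_cpu() and could ignore -ESRCH, since an
 * exiting task no longer needs its timer updated.
 * example_apply_cpu_limit() is a hypothetical name:
 *
 *	static void example_apply_cpu_limit(struct task_struct *tsk,
 *					    const struct rlimit *new)
 *	{
 *		if (new->rlim_cur != RLIM_INFINITY)
 *			update_rlimit_cpu(tsk, new->rlim_cur);
 *	}
 */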

/*
 * Functions for validating access to tasks.
 */
static struct pid *pid_for_clock(const clockid_t clock, bool gettime)
{
	const bool thread = !!CPUCLOCK_PERTHREAD(clock);
	const pid_t upid = CPUCLOCK_PID(clock);
	struct pid *pid;

	if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
		return NULL;

	/*
	 * If the encoded PID is 0, then the timer is targeted at current
	 * or the process to which current belongs.
	 */
	if (upid == 0)
		return thread ? task_pid(current) : task_tgid(current);

	pid = find_vpid(upid);
	if (!pid)
		return NULL;

	if (thread) {
		struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);
		return (tsk && same_thread_group(tsk, current)) ? pid : NULL;
	}

	/*
	 * For clock_gettime(PROCESS) allow finding the process with
	 * the pid of the current task. The code needs the tgid
	 * of the process so that pid_task(pid, PIDTYPE_TGID) can be
	 * used to find the process.
	 */
	if (gettime && (pid == task_pid(current)))
		return task_tgid(current);

	/*
	 * For process clocks, require that the pid identifies a process.
	 */
	return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL;
}
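
/*
 * Illustrative decoding of the clockid bit layout relied on above (the
 * CPUCLOCK_* macros come from <linux/posix-timers.h>, the helpers from
 * "posix-timers.h"; the values shown are a sketch, not used here):
 *
 *	const clockid_t clk = make_thread_cpuclock(1234, CPUCLOCK_VIRT);
 *
 *	CPUCLOCK_PID(clk);		// 1234, recovered from the upper bits
 *	CPUCLOCK_PERTHREAD(clk);	// true, bit 2 is set
 *	CPUCLOCK_WHICH(clk);		// CPUCLOCK_VIRT, bits 1:0
 *
 * An encoded PID of 0 means "current or current's process", which is
 * what the upid == 0 shortcut above relies on.
 */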

static inline int validate_clock_permissions(const clockid_t clock)
{
	int ret;

	rcu_read_lock();
	ret = pid_for_clock(clock, false) ? 0 : -EINVAL;
	rcu_read_unlock();

	return ret;
}

static inline enum pid_type clock_pid_type(const clockid_t clock)
{
	return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID;
}

static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer)
{
	return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock));
}

/*
 * Update expiry time from increment, and increase overrun count,
 * given the current clock sample.
 */
static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
{
	u64 delta, incr, expires = timer->it.cpu.node.expires;
	int i;

	if (!timer->it_interval)
		return expires;

	if (now < expires)
		return expires;

	incr = timer->it_interval;
	delta = now + incr - expires;

	/* Don't use (incr*2 < delta), incr*2 might overflow. */
	for (i = 0; incr < delta - incr; i++)
		incr = incr << 1;

	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;

		timer->it.cpu.node.expires += incr;
		timer->it_overrun += 1LL << i;
		delta -= incr;
	}
	return timer->it.cpu.node.expires;
}
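
/*
 * Worked example for the doubling scheme above (numbers are
 * illustrative): with expires = 100, it_interval = 10 and now = 137,
 * delta = 137 + 10 - 100 = 47. The first loop doubles incr to 40
 * (i = 2); the second loop adds 40 to expires (it_overrun += 1 << 2)
 * and skips the 20 and 10 steps because the remaining delta is 7. The
 * timer lands on 140 > now having counted 4 overruns (110, 120, 130,
 * 140) in O(log(delta/incr)) iterations instead of 4 single steps.
 */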

/* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */
static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
{
	return !(~pct->bases[CPUCLOCK_PROF].nextevt |
		 ~pct->bases[CPUCLOCK_VIRT].nextevt |
		 ~pct->bases[CPUCLOCK_SCHED].nextevt);
}

static int
posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
	int error = validate_clock_permissions(which_clock);

	if (!error) {
		tp->tv_sec = 0;
		tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
		if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
			/*
			 * If sched_clock is using a cycle counter, we
			 * don't have any idea of its true resolution
			 * exported, but it is much more than 1s/HZ.
			 */
			tp->tv_nsec = 1;
		}
	}
	return error;
}

static int
posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
{
	int error = validate_clock_permissions(clock);

	/*
	 * You can never reset a CPU clock, but we check for other errors
	 * in the call before failing with EPERM.
	 */
	return error ? : -EPERM;
}

/*
 * Sample a per-thread clock for the given task. clkid is validated.
 */
static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
{
	u64 utime, stime;

	if (clkid == CPUCLOCK_SCHED)
		return task_sched_runtime(p);

	task_cputime(p, &utime, &stime);

	switch (clkid) {
	case CPUCLOCK_PROF:
		return utime + stime;
	case CPUCLOCK_VIRT:
		return utime;
	default:
		WARN_ON_ONCE(1);
	}
	return 0;
}

static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
{
	samples[CPUCLOCK_PROF] = stime + utime;
	samples[CPUCLOCK_VIRT] = utime;
	samples[CPUCLOCK_SCHED] = rtime;
}

static void task_sample_cputime(struct task_struct *p, u64 *samples)
{
	u64 stime, utime;

	task_cputime(p, &utime, &stime);
	store_samples(samples, stime, utime, p->se.sum_exec_runtime);
}

static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
				       u64 *samples)
{
	u64 stime, utime, rtime;

	utime = atomic64_read(&at->utime);
	stime = atomic64_read(&at->stime);
	rtime = atomic64_read(&at->sum_exec_runtime);
	store_samples(samples, stime, utime, rtime);
}

/*
 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
 * to avoid race conditions with concurrent updates to cputime.
 */
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
	u64 curr_cputime = atomic64_read(cputime);

	do {
		if (sum_cputime <= curr_cputime)
			return;
	} while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));
}
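
/*
 * Note on the retry loop above: on failure atomic64_try_cmpxchg()
 * updates curr_cputime with the value it actually observed, so every
 * retry compares sum_cputime against a fresh sample. As a sketch, two
 * CPUs racing to store sums of 50 and 70 over a stored 40 can
 * interleave in any order; the stored value converges to 70 and never
 * moves backwards.
 */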

static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
			      struct task_cputime *sum)
{
	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
}

/**
 * thread_group_sample_cputime - Sample cputime for a given task
 * @tsk:	Task for which cputime needs to be sampled
 * @samples:	Storage for time samples
 *
 * Called from sys_getitimer() to calculate the expiry time of an active
 * timer. That means group cputime accounting is already active. Called
 * with task sighand lock held.
 *
 * Updates @samples with an up-to-date sample of the thread group cputimes.
 */
void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;

	WARN_ON_ONCE(!pct->timers_active);

	proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}

/**
 * thread_group_start_cputime - Start cputime and return a sample
 * @tsk:	Task for which cputime needs to be started
 * @samples:	Storage for time samples
 *
 * The thread group cputime accounting is avoided when there are no posix
 * CPU timers armed. Before starting a timer it's required to check whether
 * the time accounting is active. If not, a full update of the atomic
 * accounting store needs to be done and the accounting enabled.
 *
 * Updates @samples with an up-to-date sample of the thread group cputimes.
 */
static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;

	lockdep_assert_task_sighand_held(tsk);

	/* Check if cputimer isn't running. This is accessed without locking. */
	if (!READ_ONCE(pct->timers_active)) {
		struct task_cputime sum;

		/*
		 * The POSIX timer interface allows for absolute time expiry
		 * values through the TIMER_ABSTIME flag, therefore we have
		 * to synchronize the timer to the clock every time we start it.
		 */
		thread_group_cputime(tsk, &sum);
		update_gt_cputime(&cputimer->cputime_atomic, &sum);

		/*
		 * We're setting timers_active without a lock. Ensure this
		 * only gets written to in one operation. We set it after
		 * update_gt_cputime() as a small optimization, but
		 * barriers are not required because update_gt_cputime()
		 * can handle concurrent updates.
		 */
		WRITE_ONCE(pct->timers_active, true);
	}
	proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}

static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
{
	struct task_cputime ct;

	thread_group_cputime(tsk, &ct);
	store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime);
}

/*
 * Sample a process (thread group) clock for the given task clkid. If the
 * group's cputime accounting is already enabled, read the atomic
 * store. Otherwise a full update is required. clkid is already validated.
 */
static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
				  bool start)
{
	struct thread_group_cputimer *cputimer = &p->signal->cputimer;
	struct posix_cputimers *pct = &p->signal->posix_cputimers;
	u64 samples[CPUCLOCK_MAX];

	if (!READ_ONCE(pct->timers_active)) {
		if (start)
			thread_group_start_cputime(p, samples);
		else
			__thread_group_cputime(p, samples);
	} else {
		proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
	}

	return samples[clkid];
}

static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
{
	const clockid_t clkid = CPUCLOCK_WHICH(clock);
	struct task_struct *tsk;
	u64 t;

	rcu_read_lock();
	tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock));
	if (!tsk) {
		rcu_read_unlock();
		return -EINVAL;
	}

	if (CPUCLOCK_PERTHREAD(clock))
		t = cpu_clock_sample(clkid, tsk);
	else
		t = cpu_clock_sample_group(clkid, tsk, false);
	rcu_read_unlock();

	*tp = ns_to_timespec64(t);
	return 0;
}

/*
 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
 * new timer already all-zeros initialized.
 */
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
	static struct lock_class_key posix_cpu_timers_key;
	struct pid *pid;

	rcu_read_lock();
	pid = pid_for_clock(new_timer->it_clock, false);
	if (!pid) {
		rcu_read_unlock();
		return -EINVAL;
	}

	/*
	 * If posix timer expiry is handled in task work context then
	 * timer::it_lock can be taken without disabling interrupts as all
	 * other locking happens in task context. This requires a separate
	 * lock class key otherwise regular posix timer expiry would record
	 * the lock class being taken in interrupt context and generate a
	 * false positive warning.
	 */
	if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
		lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);

	new_timer->kclock = &clock_posix_cpu;
	timerqueue_init(&new_timer->it.cpu.node);
	new_timer->it.cpu.pid = get_pid(pid);
	rcu_read_unlock();
	return 0;
}

static struct posix_cputimer_base *timer_base(struct k_itimer *timer,
					      struct task_struct *tsk)
{
	int clkidx = CPUCLOCK_WHICH(timer->it_clock);

	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		return tsk->posix_cputimers.bases + clkidx;
	else
		return tsk->signal->posix_cputimers.bases + clkidx;
}

/*
 * Force recalculating the base earliest expiration on the next tick.
 * This will also re-evaluate the need to keep around the process wide
 * cputime counter and tick dependency and eventually shut these down
 * if necessary.
 */
static void trigger_base_recalc_expires(struct k_itimer *timer,
					struct task_struct *tsk)
{
	struct posix_cputimer_base *base = timer_base(timer, tsk);

	base->nextevt = 0;
}

/*
 * Dequeue the timer and reset the base if it was its earliest expiration.
 * It makes sure the next tick recalculates the base next expiration so we
 * don't keep the costly process wide cputime counter around for a random
 * amount of time, along with the tick dependency.
 *
 * If another timer gets queued between this and the next tick, its
 * expiration will update the base next event if necessary on the next
 * tick.
 */
static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
{
	struct cpu_timer *ctmr = &timer->it.cpu;
	struct posix_cputimer_base *base;

	if (!cpu_timer_dequeue(ctmr))
		return;

	base = timer_base(timer, p);
	if (cpu_timer_getexpires(ctmr) == base->nextevt)
		trigger_base_recalc_expires(timer, p);
}


/*
 * Clean up a CPU-clock timer that is about to be destroyed.
 * This is called from timer deletion with the timer already locked.
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again. (This happens when the timer is in the middle of firing.)
 */
static int posix_cpu_timer_del(struct k_itimer *timer)
{
	struct cpu_timer *ctmr = &timer->it.cpu;
	struct sighand_struct *sighand;
	struct task_struct *p;
	unsigned long flags;
	int ret = 0;

	rcu_read_lock();
	p = cpu_timer_task_rcu(timer);
	if (!p)
		goto out;

	/*
	 * Protect against sighand release/switch in exit/exec and process/
	 * thread timer list entry concurrent read/writes.
	 */
	sighand = lock_task_sighand(p, &flags);
	if (unlikely(sighand == NULL)) {
		/*
		 * This raced with the reaping of the task. The exit cleanup
		 * should have removed this timer from the timer queue.
		 */
		WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
	} else {
		if (timer->it.cpu.firing) {
			/*
			 * Prevent signal delivery. The timer cannot be dequeued
			 * because it is on the firing list which is not protected
			 * by sighand->lock. The delivery path is waiting for
			 * the timer lock. So go back, unlock and retry.
			 */
			timer->it.cpu.firing = false;
			ret = TIMER_RETRY;
		} else {
			disarm_timer(timer, p);
		}
		unlock_task_sighand(p, &flags);
	}

out:
	rcu_read_unlock();

	if (!ret) {
		put_pid(ctmr->pid);
		timer->it_status = POSIX_TIMER_DISARMED;
	}
	return ret;
}

static void cleanup_timerqueue(struct timerqueue_head *head)
{
	struct timerqueue_node *node;
	struct cpu_timer *ctmr;

	while ((node = timerqueue_getnext(head))) {
		timerqueue_del(head, node);
		ctmr = container_of(node, struct cpu_timer, node);
		ctmr->head = NULL;
	}
}

/*
 * Clean out CPU timers which are still armed when a thread exits. The
 * timers are only removed from the list. No other updates are done. The
 * corresponding posix timers are still accessible, but cannot be rearmed.
 *
 * This must be called with the siglock held.
 */
static void cleanup_timers(struct posix_cputimers *pct)
{
	cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead);
	cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead);
	cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead);
}

/*
 * These are both called with the siglock held, when the current thread
 * is being reaped. When the final (leader) thread in the group is reaped,
 * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
 */
void posix_cpu_timers_exit(struct task_struct *tsk)
{
	cleanup_timers(&tsk->posix_cputimers);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
	cleanup_timers(&tsk->signal->posix_cputimers);
}

/*
 * Insert the timer on the appropriate list before any timers that
 * expire later. This must be called with the sighand lock held.
 */
static void arm_timer(struct k_itimer *timer, struct task_struct *p)
{
	struct posix_cputimer_base *base = timer_base(timer, p);
	struct cpu_timer *ctmr = &timer->it.cpu;
	u64 newexp = cpu_timer_getexpires(ctmr);

	timer->it_status = POSIX_TIMER_ARMED;
	if (!cpu_timer_enqueue(&base->tqhead, ctmr))
		return;

	/*
	 * We are the new earliest-expiring POSIX 1.b timer, hence
	 * need to update expiration cache. Take into account that
	 * for process timers we share expiration cache with itimers
	 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
	 */
	if (newexp < base->nextevt)
		base->nextevt = newexp;

	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
	else
		tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
}

/*
 * The timer is locked, fire it and arrange for its reload.
 */
static void cpu_timer_fire(struct k_itimer *timer)
{
	struct cpu_timer *ctmr = &timer->it.cpu;

	timer->it_status = POSIX_TIMER_DISARMED;

	if (unlikely(ctmr->nanosleep)) {
		/*
		 * This is a special case for clock_nanosleep,
		 * not a normal timer from sys_timer_create.
		 */
		wake_up_process(timer->it_process);
		cpu_timer_setexpires(ctmr, 0);
	} else {
		posix_timer_queue_signal(timer);
		/* Disable oneshot timers */
		if (!timer->it_interval)
			cpu_timer_setexpires(ctmr, 0);
	}
}

static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now);

/*
 * Guts of sys_timer_settime for CPU timers.
 * This is called with the timer locked and interrupts disabled.
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again. (This happens when the timer is in the middle of firing.)
 */
static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
			       struct itimerspec64 *new, struct itimerspec64 *old)
{
	bool sigev_none = timer->it_sigev_notify == SIGEV_NONE;
	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
	struct cpu_timer *ctmr = &timer->it.cpu;
	u64 old_expires, new_expires, now;
	struct sighand_struct *sighand;
	struct task_struct *p;
	unsigned long flags;
	int ret = 0;

	rcu_read_lock();
	p = cpu_timer_task_rcu(timer);
	if (!p) {
		/*
		 * If p has just been reaped, we can no
		 * longer get any information about it at all.
		 */
		rcu_read_unlock();
		return -ESRCH;
	}

	/*
	 * Use the to_ktime conversion because that clamps the maximum
	 * value to KTIME_MAX and avoids multiplication overflows.
	 */
	new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value));

	/*
	 * Protect against sighand release/switch in exit/exec and p->cpu_timers
	 * and p->signal->cpu_timers read/write in arm_timer()
	 */
	sighand = lock_task_sighand(p, &flags);
	/*
	 * If p has just been reaped, we can no
	 * longer get any information about it at all.
	 */
	if (unlikely(sighand == NULL)) {
		rcu_read_unlock();
		return -ESRCH;
	}

	/* Retrieve the current expiry time before disarming the timer */
	old_expires = cpu_timer_getexpires(ctmr);

	if (unlikely(timer->it.cpu.firing)) {
		/*
		 * Prevent signal delivery. The timer cannot be dequeued
		 * because it is on the firing list which is not protected
		 * by sighand->lock. The delivery path is waiting for
		 * the timer lock. So go back, unlock and retry.
		 */
		timer->it.cpu.firing = false;
		ret = TIMER_RETRY;
	} else {
		cpu_timer_dequeue(ctmr);
		timer->it_status = POSIX_TIMER_DISARMED;
	}

	/*
	 * Sample the current clock for saving the previous setting
	 * and for rearming the timer.
	 */
	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		now = cpu_clock_sample(clkid, p);
	else
		now = cpu_clock_sample_group(clkid, p, !sigev_none);

	/* Retrieve the previous expiry value if requested. */
	if (old) {
		old->it_value = (struct timespec64){ };
		if (old_expires)
			__posix_cpu_timer_get(timer, old, now);
	}

	/* Retry if the timer expiry is running concurrently */
	if (unlikely(ret)) {
		unlock_task_sighand(p, &flags);
		goto out;
	}

	/* Convert relative expiry time to absolute */
	if (new_expires && !(timer_flags & TIMER_ABSTIME))
		new_expires += now;

	/* Set the new expiry time (might be 0) */
	cpu_timer_setexpires(ctmr, new_expires);

	/*
	 * Arm the timer if it is not disabled, the new expiry value has
	 * not yet expired and the timer requires signal delivery.
	 * SIGEV_NONE timers are never armed. In case the timer is not
	 * armed, enforce the reevaluation of the timer base so that the
	 * process wide cputime counter can be disabled eventually.
	 */
	if (likely(!sigev_none)) {
		if (new_expires && now < new_expires)
			arm_timer(timer, p);
		else
			trigger_base_recalc_expires(timer, p);
	}

	unlock_task_sighand(p, &flags);

	posix_timer_set_common(timer, new);

	/*
	 * If the new expiry time was already in the past the timer was not
	 * queued. Fire it immediately even if the thread never runs to
	 * accumulate more time on this clock.
	 */
	if (!sigev_none && new_expires && now >= new_expires)
		cpu_timer_fire(timer);
out:
	rcu_read_unlock();
	return ret;
}
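
/*
 * Illustrative userspace view of the path above (standard POSIX
 * timer_settime() semantics, not code from this file; timerid is
 * assumed to name a CPU-clock timer created via timer_create()):
 *
 *	struct itimerspec its = {
 *		// First expiry after 1s of consumed CPU time, then every
 *		// 250ms of CPU time. Without TIMER_ABSTIME the value is
 *		// relative, i.e. new_expires += now above.
 *		.it_value    = { .tv_sec = 1 },
 *		.it_interval = { .tv_nsec = 250000000 },
 *	};
 *
 *	timer_settime(timerid, 0, &its, NULL);
 */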

static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now)
{
	bool sigev_none = timer->it_sigev_notify == SIGEV_NONE;
	u64 expires, iv = timer->it_interval;

	/*
	 * Make sure that interval timers are moved forward for the
	 * following cases:
	 *  - SIGEV_NONE timers which are never armed
	 *  - Timers which expired, but the signal has not yet been
	 *    delivered
	 */
	if (iv && timer->it_status != POSIX_TIMER_ARMED)
		expires = bump_cpu_timer(timer, now);
	else
		expires = cpu_timer_getexpires(&timer->it.cpu);

	/*
	 * Expired interval timers cannot have a remaining time <= 0.
	 * The kernel has to move them forward so that the next
	 * timer expiry is > @now.
	 */
	if (now < expires) {
		itp->it_value = ns_to_timespec64(expires - now);
	} else {
		/*
		 * A single shot SIGEV_NONE timer must return 0, when it is
		 * expired! Timers which have a real signal delivery mode
		 * must return a remaining time greater than 0 because the
		 * signal has not yet been delivered.
		 */
		if (!sigev_none)
			itp->it_value.tv_nsec = 1;
	}
}

static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
{
	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
	struct task_struct *p;
	u64 now;

	rcu_read_lock();
	p = cpu_timer_task_rcu(timer);
	if (p && cpu_timer_getexpires(&timer->it.cpu)) {
		itp->it_interval = ktime_to_timespec64(timer->it_interval);

		if (CPUCLOCK_PERTHREAD(timer->it_clock))
			now = cpu_clock_sample(clkid, p);
		else
			now = cpu_clock_sample_group(clkid, p, false);

		__posix_cpu_timer_get(timer, itp, now);
	}
	rcu_read_unlock();
}

#define MAX_COLLECTED	20

static u64 collect_timerqueue(struct timerqueue_head *head,
			      struct list_head *firing, u64 now)
{
	struct timerqueue_node *next;
	int i = 0;

	while ((next = timerqueue_getnext(head))) {
		struct cpu_timer *ctmr;
		u64 expires;

		ctmr = container_of(next, struct cpu_timer, node);
		expires = cpu_timer_getexpires(ctmr);
		/* Limit the number of timers to expire at once */
		if (++i == MAX_COLLECTED || now < expires)
			return expires;

		ctmr->firing = true;
		/* See posix_cpu_timer_wait_running() */
		rcu_assign_pointer(ctmr->handling, current);
		cpu_timer_dequeue(ctmr);
		list_add_tail(&ctmr->elist, firing);
	}

	return U64_MAX;
}

static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
				    struct list_head *firing)
{
	struct posix_cputimer_base *base = pct->bases;
	int i;

	for (i = 0; i < CPUCLOCK_MAX; i++, base++) {
		base->nextevt = collect_timerqueue(&base->tqhead, firing,
						   samples[i]);
	}
}

static inline void check_dl_overrun(struct task_struct *tsk)
{
	if (tsk->dl.dl_overrun) {
		tsk->dl.dl_overrun = 0;
		send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
	}
}

static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
{
	if (time < limit)
		return false;

	if (print_fatal_signals) {
		pr_info("%s Watchdog Timeout (%s): %s[%d]\n",
			rt ? "RT" : "CPU", hard ? "hard" : "soft",
			current->comm, task_pid_nr(current));
	}
	send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID);
	return true;
}

/*
 * Check for any per-thread CPU timers that have fired and move them off
 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
 * tsk->it_*_expires values to reflect the remaining thread CPU timers.
 */
static void check_thread_timers(struct task_struct *tsk,
				struct list_head *firing)
{
	struct posix_cputimers *pct = &tsk->posix_cputimers;
	u64 samples[CPUCLOCK_MAX];
	unsigned long soft;

	if (dl_task(tsk))
		check_dl_overrun(tsk);

	if (expiry_cache_is_inactive(pct))
		return;

	task_sample_cputime(tsk, samples);
	collect_posix_cputimers(pct, samples, firing);

	/*
	 * Check for the special case thread timers.
	 */
	soft = task_rlimit(tsk, RLIMIT_RTTIME);
	if (soft != RLIM_INFINITY) {
		/* Task RT timeout is accounted in jiffies. RTTIME is usec */
		unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
		unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);

		/* At the hard limit, send SIGKILL. No further action. */
		if (hard != RLIM_INFINITY &&
		    check_rlimit(rttime, hard, SIGKILL, true, true))
			return;

		/* At the soft limit, send a SIGXCPU every second */
		if (check_rlimit(rttime, soft, SIGXCPU, true, false)) {
			soft += USEC_PER_SEC;
			tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;
		}
	}

	if (expiry_cache_is_inactive(pct))
		tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
}

static inline void stop_process_timers(struct signal_struct *sig)
{
	struct posix_cputimers *pct = &sig->posix_cputimers;

	/* Turn off the active flag. This is done without locking. */
	WRITE_ONCE(pct->timers_active, false);
	tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
}

static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
			     u64 *expires, u64 cur_time, int signo)
{
	if (!it->expires)
		return;

	if (cur_time >= it->expires) {
		if (it->incr)
			it->expires += it->incr;
		else
			it->expires = 0;

		trace_itimer_expire(signo == SIGPROF ?
				    ITIMER_PROF : ITIMER_VIRTUAL,
				    task_tgid(tsk), cur_time);
		send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
	}

	if (it->expires && it->expires < *expires)
		*expires = it->expires;
}

/*
 * Check for any process-wide CPU timers that have fired and move them
 * off the tsk->*_timers list onto the firing list. Per-thread timers
 * have already been taken off.
 */
static void check_process_timers(struct task_struct *tsk,
				 struct list_head *firing)
{
	struct signal_struct *const sig = tsk->signal;
	struct posix_cputimers *pct = &sig->posix_cputimers;
	u64 samples[CPUCLOCK_MAX];
	unsigned long soft;

	/*
	 * If there are no active process wide timers (POSIX 1.b, itimers,
	 * RLIMIT_CPU) nothing to check. Also skip the process wide timer
	 * processing when there is already another task handling them.
	 */
	if (!READ_ONCE(pct->timers_active) || pct->expiry_active)
		return;

	/*
	 * Signify that a thread is checking for process timers.
	 * Write access to this field is protected by the sighand lock.
	 */
	pct->expiry_active = true;

	/*
	 * Collect the current process totals. Group accounting is active
	 * so the sample can be taken directly.
	 */
	proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples);
	collect_posix_cputimers(pct, samples, firing);

	/*
	 * Check for the special case process timers.
	 */
	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF],
			 &pct->bases[CPUCLOCK_PROF].nextevt,
			 samples[CPUCLOCK_PROF], SIGPROF);
	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT],
			 &pct->bases[CPUCLOCK_VIRT].nextevt,
			 samples[CPUCLOCK_VIRT], SIGVTALRM);

	soft = task_rlimit(tsk, RLIMIT_CPU);
	if (soft != RLIM_INFINITY) {
		/* RLIMIT_CPU is in seconds. Samples are nanoseconds */
		unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
		u64 ptime = samples[CPUCLOCK_PROF];
		u64 softns = (u64)soft * NSEC_PER_SEC;
		u64 hardns = (u64)hard * NSEC_PER_SEC;

		/* At the hard limit, send SIGKILL. No further action. */
		if (hard != RLIM_INFINITY &&
		    check_rlimit(ptime, hardns, SIGKILL, false, true))
			return;

		/* At the soft limit, send a SIGXCPU every second */
		if (check_rlimit(ptime, softns, SIGXCPU, false, false)) {
			sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1;
			softns += NSEC_PER_SEC;
		}

		/* Update the expiry cache */
		if (softns < pct->bases[CPUCLOCK_PROF].nextevt)
			pct->bases[CPUCLOCK_PROF].nextevt = softns;
	}

	if (expiry_cache_is_inactive(pct))
		stop_process_timers(sig);

	pct->expiry_active = false;
}

/*
 * This is called from the signal code (via posixtimer_rearm)
 * when the last timer signal was delivered and we have to reload the timer.
 */
static void posix_cpu_timer_rearm(struct k_itimer *timer)
{
	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
	struct task_struct *p;
	struct sighand_struct *sighand;
	unsigned long flags;
	u64 now;

	rcu_read_lock();
	p = cpu_timer_task_rcu(timer);
	if (!p)
		goto out;

	/* Protect timer list r/w in arm_timer() */
	sighand = lock_task_sighand(p, &flags);
	if (unlikely(sighand == NULL))
		goto out;

	/*
	 * Fetch the current sample and update the timer's expiry time.
	 */
	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		now = cpu_clock_sample(clkid, p);
	else
		now = cpu_clock_sample_group(clkid, p, true);

	bump_cpu_timer(timer, now);

	/*
	 * Now re-arm for the new expiry time.
	 */
	arm_timer(timer, p);
	unlock_task_sighand(p, &flags);
out:
	rcu_read_unlock();
}

/**
 * task_cputimers_expired - Check whether posix CPU timers are expired
 *
 * @samples:	Array of current samples for the CPUCLOCK clocks
 * @pct:	Pointer to a posix_cputimers container
 *
 * Returns true if any member of @samples is greater than or equal to the
 * corresponding member of @pct->bases[CLK].nextevt. False otherwise.
 */
static inline bool
task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct)
{
	int i;

	for (i = 0; i < CPUCLOCK_MAX; i++) {
		if (samples[i] >= pct->bases[i].nextevt)
			return true;
	}
	return false;
}

/**
 * fastpath_timer_check - POSIX CPU timers fast path.
 *
 * @tsk:	The task (thread) being checked.
 *
 * Check the task and thread group timers. If both are zero (there are no
 * timers set) return false. Otherwise snapshot the task and thread group
 * timers and compare them with the corresponding expiration times. Return
 * true if a timer has expired, else return false.
 */
static inline bool fastpath_timer_check(struct task_struct *tsk)
{
	struct posix_cputimers *pct = &tsk->posix_cputimers;
	struct signal_struct *sig;

	if (!expiry_cache_is_inactive(pct)) {
		u64 samples[CPUCLOCK_MAX];

		task_sample_cputime(tsk, samples);
		if (task_cputimers_expired(samples, pct))
			return true;
	}

	sig = tsk->signal;
	pct = &sig->posix_cputimers;
	/*
	 * Check if thread group timers expired when timers are active and
	 * no other thread in the group is already handling expiry for
	 * thread group cputimers. These fields are read without the
	 * sighand lock. However, this is fine because this is meant to be
	 * a fastpath heuristic to determine whether we should try to
	 * acquire the sighand lock to handle timer expiry.
	 *
	 * In the worst case scenario, if concurrently timers_active is set
	 * or expiry_active is cleared, but the current thread doesn't see
	 * the change yet, the timer checks are delayed until the next
	 * thread in the group gets a scheduler interrupt to handle the
	 * timer. This isn't an issue in practice because these types of
	 * delays with signals actually getting sent are expected.
	 */
	if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) {
		u64 samples[CPUCLOCK_MAX];

		proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic,
					   samples);

		if (task_cputimers_expired(samples, pct))
			return true;
	}

	if (dl_task(tsk) && tsk->dl.dl_overrun)
		return true;

	return false;
}

static void handle_posix_cpu_timers(struct task_struct *tsk);

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
	struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);

	mutex_lock(&cw->mutex);
	handle_posix_cpu_timers(current);
	mutex_unlock(&cw->mutex);
}

/*
 * Invoked from the posix-timer core when a cancel operation failed because
 * the timer is marked firing. The caller holds rcu_read_lock(), which
 * protects the timer and the task which is expiring it from being freed.
 */
static void posix_cpu_timer_wait_running(struct k_itimer *timr)
{
	struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);

	/* Has the handling task completed expiry already? */
	if (!tsk)
		return;

	/* Ensure that the task cannot go away */
	get_task_struct(tsk);
	/* Now drop the RCU protection so the mutex can be locked */
	rcu_read_unlock();
	/* Wait on the expiry mutex */
	mutex_lock(&tsk->posix_cputimers_work.mutex);
	/* Release it immediately again. */
	mutex_unlock(&tsk->posix_cputimers_work.mutex);
	/* Drop the task reference. */
	put_task_struct(tsk);
	/* Relock RCU so the callsite is balanced */
	rcu_read_lock();
}

static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
{
	/* Ensure that timr->it.cpu.handling task cannot go away */
	rcu_read_lock();
	spin_unlock_irq(&timr->it_lock);
	posix_cpu_timer_wait_running(timr);
	rcu_read_unlock();
	/* @timr is on stack and is valid */
	spin_lock_irq(&timr->it_lock);
}

/*
 * Clear existing posix CPU timers task work.
 */
void clear_posix_cputimers_work(struct task_struct *p)
{
	/*
	 * A copied work entry from the old task is not meaningful, clear it.
	 * N.B. init_task_work will not do this.
	 */
	memset(&p->posix_cputimers_work.work, 0,
	       sizeof(p->posix_cputimers_work.work));
	init_task_work(&p->posix_cputimers_work.work,
		       posix_cpu_timers_work);
	mutex_init(&p->posix_cputimers_work.mutex);
	p->posix_cputimers_work.scheduled = false;
}

/*
 * Initialize posix CPU timers task work in init task. Out of line to
 * keep the callback static and to avoid header recursion hell.
 */
void __init posix_cputimers_init_work(void)
{
	clear_posix_cputimers_work(current);
}

/*
 * Note: All operations on tsk->posix_cputimers_work.scheduled happen either
 * in hard interrupt context or in task context with interrupts
 * disabled. Aside of that the writer/reader interaction is always in the
 * context of the current task, which means they are strict per CPU.
 */
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
	return tsk->posix_cputimers_work.scheduled;
}

static inline void __run_posix_cpu_timers(struct task_struct *tsk)
{
	if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
		return;

	/* Schedule task work to actually expire the timers */
	tsk->posix_cputimers_work.scheduled = true;
	task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
}

static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
						unsigned long start)
{
	bool ret = true;

	/*
	 * On !RT kernels interrupts are disabled while collecting expired
	 * timers, so no tick can happen and the fast path check can be
	 * reenabled without further checks.
	 */
	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
		tsk->posix_cputimers_work.scheduled = false;
		return true;
	}

	/*
	 * On RT enabled kernels ticks can happen while the expired timers
	 * are collected under sighand lock. But any tick which observes
	 * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
	 * checks. So reenabling the tick work has to be done carefully:
	 *
	 * Disable interrupts and run the fast path check if jiffies have
	 * advanced since the collecting of expired timers started. If
	 * jiffies have not advanced or the fast path check did not find
	 * newly expired timers, reenable the fast path check in the timer
	 * interrupt. If there are newly expired timers, return false and
	 * let the collection loop repeat.
	 */
	local_irq_disable();
	if (start != jiffies && fastpath_timer_check(tsk))
		ret = false;
	else
		tsk->posix_cputimers_work.scheduled = false;
	local_irq_enable();

	return ret;
}
#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
static inline void __run_posix_cpu_timers(struct task_struct *tsk)
{
	lockdep_posixtimer_enter();
	handle_posix_cpu_timers(tsk);
	lockdep_posixtimer_exit();
}

static void posix_cpu_timer_wait_running(struct k_itimer *timr)
{
	cpu_relax();
}

static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
{
	spin_unlock_irq(&timr->it_lock);
	cpu_relax();
	spin_lock_irq(&timr->it_lock);
}

static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
	return false;
}

static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
						unsigned long start)
{
	return true;
}
#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */

static void handle_posix_cpu_timers(struct task_struct *tsk)
{
	struct k_itimer *timer, *next;
	unsigned long flags, start;
	LIST_HEAD(firing);

	if (!lock_task_sighand(tsk, &flags))
		return;

	do {
		/*
		 * On RT locking sighand lock does not disable interrupts,
		 * so this needs to be careful vs. ticks. Store the current
		 * jiffies value.
		 */
		start = READ_ONCE(jiffies);
		barrier();

		/*
		 * Here we take off tsk->signal->cpu_timers[N] and
		 * tsk->cpu_timers[N] all the timers that are firing, and
		 * put them on the firing list.
		 */
		check_thread_timers(tsk, &firing);

		check_process_timers(tsk, &firing);

		/*
		 * The above timer checks have updated the expiry cache and
		 * because nothing can have queued or modified timers after
		 * sighand lock was taken above it is guaranteed to be
		 * consistent. So the next timer interrupt fastpath check
		 * will find valid data.
		 *
		 * If timer expiry runs in the timer interrupt context then
		 * the loop is not relevant as timers will be directly
		 * expired in interrupt context. The stub function below
		 * returns always true which allows the compiler to
		 * optimize the loop out.
		 *
		 * If timer expiry is deferred to task work context then
		 * the following rules apply:
		 *
		 * - On !RT kernels no tick can have happened on this CPU
		 *   after sighand lock was acquired because interrupts are
		 *   disabled. So reenabling task work before dropping
		 *   sighand lock and reenabling interrupts is race free.
		 *
		 * - On RT kernels ticks might have happened but the tick
		 *   work ignored posix CPU timer handling because the
		 *   CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
		 *   must be done very carefully including a check whether
		 *   ticks have happened since the start of the timer
		 *   expiry checks. posix_cpu_timers_enable_work() takes
		 *   care of that and eventually lets the expiry checks
		 *   run again.
		 */
	} while (!posix_cpu_timers_enable_work(tsk, start));

	/*
	 * We must release sighand lock before taking any timer's lock.
	 * There is a potential race with timer deletion here, as the
	 * siglock now protects our private firing list. We have set
	 * the firing flag in each timer, so that a deletion attempt
	 * that gets the timer lock before we do will give it up and
	 * spin until we've taken care of that timer below.
	 */
	unlock_task_sighand(tsk, &flags);

	/*
	 * Now that all the timers on our list have the firing flag,
	 * no one will touch their list entries but us. We'll take
	 * each timer's lock before clearing its firing flag, so no
	 * timer call will interfere.
	 */
	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
		bool cpu_firing;

		/*
		 * spin_lock() is sufficient here even independent of the
		 * expiry context. If expiry happens in hard interrupt
		 * context it's obvious. For task work context it's safe
		 * because all other operations on timer::it_lock happen in
		 * task context (syscall or exit).
		 */
		spin_lock(&timer->it_lock);
		list_del_init(&timer->it.cpu.elist);
		cpu_firing = timer->it.cpu.firing;
		timer->it.cpu.firing = false;
		/*
		 * If the firing flag is cleared then this raced with a
		 * timer rearm/delete operation. So don't generate an
		 * event.
		 */
		if (likely(cpu_firing))
			cpu_timer_fire(timer);
		/* See posix_cpu_timer_wait_running() */
		rcu_assign_pointer(timer->it.cpu.handling, NULL);
		spin_unlock(&timer->it_lock);
	}
}

/*
 * This is called from the timer interrupt handler. The irq handler has
 * already updated our counts. We need to check if any timers fire now.
 * Interrupts are disabled.
 */
void run_posix_cpu_timers(void)
{
	struct task_struct *tsk = current;

	lockdep_assert_irqs_disabled();

	/*
	 * Ensure that release_task(tsk) can't happen while
	 * handle_posix_cpu_timers() is running. Otherwise, a concurrent
	 * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
	 * miss timer->it.cpu.firing != 0.
	 */
	if (tsk->exit_state)
		return;

	/*
	 * If the actual expiry is deferred to task work context and the
	 * work is already scheduled there is no point to do anything here.
	 */
	if (posix_cpu_timers_work_scheduled(tsk))
		return;

	/*
	 * The fast path checks that there are no expired thread or thread
	 * group timers. If that's so, just return.
	 */
	if (!fastpath_timer_check(tsk))
		return;

	__run_posix_cpu_timers(tsk);
}

/*
 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
 * The tsk->sighand->siglock must be held by the caller.
 */
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
			   u64 *newval, u64 *oldval)
{
	u64 now, *nextevt;

	if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED))
		return;

	nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt;
	now = cpu_clock_sample_group(clkid, tsk, true);

	if (oldval) {
		/*
		 * We are setting itimer. The *oldval is absolute and we update
		 * it to be relative, *newval argument is relative and we update
		 * it to be absolute.
		 */
		if (*oldval) {
			if (*oldval <= now) {
				/* Just about to fire. */
				*oldval = TICK_NSEC;
			} else {
				*oldval -= now;
			}
		}

		if (*newval)
			*newval += now;
	}

	/*
	 * Update expiration cache if this is the earliest timer. The
	 * CPUCLOCK_PROF expiry cache is also used by RLIMIT_CPU!
	 */
	if (*newval < *nextevt)
		*nextevt = *newval;

	tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
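
/*
 * Worked example for the absolute/relative conversion above (numbers
 * are illustrative): with now = 5s of accumulated CPU time, an old
 * absolute expiry of 7s is reported back as *oldval = 2s remaining,
 * which is what getitimer() expects, while a relative *newval of 3s is
 * stored as an absolute expiry of 8s and becomes the new nextevt if it
 * is the earliest.
 */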

static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
			    const struct timespec64 *rqtp)
{
	struct itimerspec64 it;
	struct k_itimer timer;
	u64 expires;
	int error;

	/*
	 * Set up a temporary timer and then wait for it to go off.
	 */
	memset(&timer, 0, sizeof timer);
	spin_lock_init(&timer.it_lock);
	timer.it_clock = which_clock;
	timer.it_overrun = -1;
	error = posix_cpu_timer_create(&timer);
	timer.it_process = current;
	timer.it.cpu.nanosleep = true;

	if (!error) {
		static struct itimerspec64 zero_it;
		struct restart_block *restart;

		memset(&it, 0, sizeof(it));
		it.it_value = *rqtp;

		spin_lock_irq(&timer.it_lock);
		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
		if (error) {
			spin_unlock_irq(&timer.it_lock);
			return error;
		}

		while (!signal_pending(current)) {
			if (!cpu_timer_getexpires(&timer.it.cpu)) {
				/*
				 * Our timer fired and was reset, so the
				 * deletion below cannot fail.
				 */
				posix_cpu_timer_del(&timer);
				spin_unlock_irq(&timer.it_lock);
				return 0;
			}

			/*
			 * Block until cpu_timer_fire (or a signal) wakes us.
			 */
			__set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&timer.it_lock);
			schedule();
			spin_lock_irq(&timer.it_lock);
		}

		/*
		 * We were interrupted by a signal.
		 */
		expires = cpu_timer_getexpires(&timer.it.cpu);
		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
		if (!error) {
			/* Timer is now unarmed, deletion can not fail. */
			posix_cpu_timer_del(&timer);
		} else {
			while (error == TIMER_RETRY) {
				posix_cpu_timer_wait_running_nsleep(&timer);
				error = posix_cpu_timer_del(&timer);
			}
		}

		spin_unlock_irq(&timer.it_lock);

		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
			/*
			 * It actually did fire already.
			 */
			return 0;
		}

		error = -ERESTART_RESTARTBLOCK;
		/*
		 * Report back to the user the time still remaining.
		 */
		restart = &current->restart_block;
		restart->nanosleep.expires = ns_to_ktime(expires);
		if (restart->nanosleep.type != TT_NONE)
			error = nanosleep_copyout(restart, &it.it_value);
	}

	return error;
}

static long posix_cpu_nsleep_restart(struct restart_block *restart_block);

static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
			    const struct timespec64 *rqtp)
{
	struct restart_block *restart_block = &current->restart_block;
	int error;

	/*
	 * Diagnose required errors first.
	 */
	if (CPUCLOCK_PERTHREAD(which_clock) &&
	    (CPUCLOCK_PID(which_clock) == 0 ||
	     CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
		return -EINVAL;

	error = do_cpu_nanosleep(which_clock, flags, rqtp);

	if (error == -ERESTART_RESTARTBLOCK) {

		if (flags & TIMER_ABSTIME)
			return -ERESTARTNOHAND;

		restart_block->nanosleep.clockid = which_clock;
		set_restart_fn(restart_block, posix_cpu_nsleep_restart);
	}
	return error;
}

static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
	clockid_t which_clock = restart_block->nanosleep.clockid;
	struct timespec64 t;

	t = ktime_to_timespec64(restart_block->nanosleep.expires);

	return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}

#define PROCESS_CLOCK	make_process_cpuclock(0, CPUCLOCK_SCHED)
#define THREAD_CLOCK	make_thread_cpuclock(0, CPUCLOCK_SCHED)

static int process_cpu_clock_getres(const clockid_t which_clock,
				    struct timespec64 *tp)
{
	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
}
static int process_cpu_clock_get(const clockid_t which_clock,
				 struct timespec64 *tp)
{
	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
}
static int process_cpu_timer_create(struct k_itimer *timer)
{
	timer->it_clock = PROCESS_CLOCK;
	return posix_cpu_timer_create(timer);
}
static int process_cpu_nsleep(const clockid_t which_clock, int flags,
			      const struct timespec64 *rqtp)
{
	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
}
static int thread_cpu_clock_getres(const clockid_t which_clock,
				   struct timespec64 *tp)
{
	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
}
static int thread_cpu_clock_get(const clockid_t which_clock,
				struct timespec64 *tp)
{
	return posix_cpu_clock_get(THREAD_CLOCK, tp);
}
static int thread_cpu_timer_create(struct k_itimer *timer)
{
	timer->it_clock = THREAD_CLOCK;
	return posix_cpu_timer_create(timer);
}

const struct k_clock clock_posix_cpu = {
	.clock_getres		= posix_cpu_clock_getres,
	.clock_set		= posix_cpu_clock_set,
	.clock_get_timespec	= posix_cpu_clock_get,
	.timer_create		= posix_cpu_timer_create,
	.nsleep			= posix_cpu_nsleep,
	.timer_set		= posix_cpu_timer_set,
	.timer_del		= posix_cpu_timer_del,
	.timer_get		= posix_cpu_timer_get,
	.timer_rearm		= posix_cpu_timer_rearm,
	.timer_wait_running	= posix_cpu_timer_wait_running,
};

const struct k_clock clock_process = {
	.clock_getres		= process_cpu_clock_getres,
	.clock_get_timespec	= process_cpu_clock_get,
	.timer_create		= process_cpu_timer_create,
	.nsleep			= process_cpu_nsleep,
};

const struct k_clock clock_thread = {
	.clock_getres		= thread_cpu_clock_getres,
	.clock_get_timespec	= thread_cpu_clock_get,
	.timer_create		= thread_cpu_timer_create,
};