pidfs.c source code [linux/fs/pidfs.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/anon_inodes.h>
3	#include <linux/exportfs.h>
4	#include <linux/file.h>
5	#include <linux/fs.h>
6	#include <linux/cgroup.h>
7	#include <linux/magic.h>
8	#include <linux/mount.h>
9	#include <linux/pid.h>
10	#include <linux/pidfs.h>
11	#include <linux/pid_namespace.h>
12	#include <linux/poll.h>
13	#include <linux/proc_fs.h>
14	#include <linux/proc_ns.h>
15	#include <linux/pseudo_fs.h>
16	#include <linux/ptrace.h>
17	#include <linux/seq_file.h>
18	#include <uapi/linux/pidfd.h>
19	#include <linux/ipc_namespace.h>
20	#include <linux/time_namespace.h>
21	#include <linux/utsname.h>
22	#include <net/net_namespace.h>
23	#include <linux/coredump.h>
24	#include <linux/xattr.h>
25
26	#include "internal.h"
27	#include "mount.h"
28
29	#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
30
31	static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
32	static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
33
34	static struct path pidfs_root_path = {};
35
36	void pidfs_get_root(struct path *path)
37	{
38	*path = pidfs_root_path;
39	path_get(path);
40	}
41
/*
 * Bit numbers used in pidfs_attr->attr_mask. A bit is set once the
 * corresponding fields of struct pidfs_attr have been written.
 */
enum pidfs_attr_mask_bits {
	PIDFS_ATTR_BIT_EXIT	= 0,	/* set by pidfs_exit() */
	PIDFS_ATTR_BIT_COREDUMP	= 1,	/* set by pidfs_coredump() */
};
46
47	struct pidfs_attr {
48	unsigned long attr_mask;
49	struct simple_xattrs *xattrs;
50	struct / exit info / {
51	__u64 cgroupid;
52	__s32 exit_code;
53	};
54	__u32 coredump_mask;
55	__u32 coredump_signal;
56	};
57
58	static struct rb_root pidfs_ino_tree = RB_ROOT;
59
60	#if BITS_PER_LONG == 32
61	static inline unsigned long pidfs_ino(u64 ino)
62	{
63	return lower_32_bits(ino);
64	}
65
66	/ On 32 bit the generation number are the upper 32 bits. /
67	static inline u32 pidfs_gen(u64 ino)
68	{
69	return upper_32_bits(ino);
70	}
71
72	#else
73
74	/ On 64 bit simply return ino. /
75	static inline unsigned long pidfs_ino(u64 ino)
76	{
77	return ino;
78	}
79
80	/ On 64 bit the generation number is 0. /
81	static inline u32 pidfs_gen(u64 ino)
82	{
83	return `0`;
84	}
85	#endif
86
87	static int pidfs_ino_cmp(struct rb_node a, const* struct rb_node *b)
88	{
89	struct pid pid_a = rb_entry(a, struct* pid, pidfs_node);
90	struct pid pid_b = rb_entry(b, struct* pid, pidfs_node);
91	u64 pid_ino_a = pid_a->ino;
92	u64 pid_ino_b = pid_b->ino;
93
94	if (pid_ino_a < pid_ino_b)
95	return -`1`;
96	if (pid_ino_a > pid_ino_b)
97	return `1`;
98	return `0`;
99	}
100
101	void pidfs_add_pid(struct pid *pid)
102	{
103	static u64 pidfs_ino_nr = `2`;
104
105	/*
106	* On 64 bit nothing special happens. The 64bit number assigned
107	* to struct pid is the inode number.
108	*
109	* On 32 bit the 64 bit number assigned to struct pid is split
110	* into two 32 bit numbers. The lower 32 bits are used as the
111	* inode number and the upper 32 bits are used as the inode
112	* generation number.
113	*
114	* On 32 bit pidfs_ino() will return the lower 32 bit. When
115	* pidfs_ino() returns zero a wrap around happened. When a
116	* wraparound happens the 64 bit number will be incremented by 2
117	* so inode numbering starts at 2 again.
118	*
119	* On 64 bit comparing two pidfds is as simple as comparing
120	* inode numbers.
121	*
122	* When a wraparound happens on 32 bit multiple pidfds with the
123	* same inode number are likely to exist (This isn't a problem
124	* since before pidfs pidfds used the anonymous inode meaning
125	* all pidfds had the same inode number.). Userspace can
126	* reconstruct the 64 bit identifier by retrieving both the
127	* inode number and the inode generation number to compare or
128	* use file handles.
129	*/
130	if (pidfs_ino(ino: pidfs_ino_nr) == `0`)
131	pidfs_ino_nr += `2`;
132
133	pid->ino = pidfs_ino_nr;
134	pid->stashed = NULL;
135	pid->attr = NULL;
136	pidfs_ino_nr++;
137
138	write_seqcount_begin(&pidmap_lock_seq);
139	rb_find_add_rcu(node: &pid->pidfs_node, tree: &pidfs_ino_tree, cmp: pidfs_ino_cmp);
140	write_seqcount_end(&pidmap_lock_seq);
141	}
142
143	void pidfs_remove_pid(struct pid *pid)
144	{
145	write_seqcount_begin(&pidmap_lock_seq);
146	rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
147	write_seqcount_end(&pidmap_lock_seq);
148	}
149
150	void pidfs_free_pid(struct pid *pid)
151	{
152	struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
153	struct simple_xattrs *xattrs __free(kfree) = NULL;
154
155	/*
156	* Any dentry must've been wiped from the pid by now.
157	* Otherwise there's a reference count bug.
158	*/
159	VFS_WARN_ON_ONCE(pid->stashed);
160
161	/*
162	* This if an error occurred during e.g., task creation that
163	* causes us to never go through the exit path.
164	*/
165	if (unlikely(!attr))
166	return;
167
168	/ This never had a pidfd created. /
169	if (IS_ERR(ptr: attr))
170	return;
171
172	xattrs = no_free_ptr(attr->xattrs);
173	if (xattrs)
174	simple_xattrs_free(xattrs, NULL);
175	}
176
#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid of a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = pidfd_pid(f);
	struct pid_namespace *ns;
	pid_t nr = -1;	/* printed as-is when the pid has no task anymore */

	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/*
		 * If nr is positive it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif
243
244	/*
245	* Poll support for process exit notification.
246	*/
247	static __poll_t pidfd_poll(struct file file, struct* poll_table_struct *pts)
248	{
249	struct pid *pid = pidfd_pid(file);
250	struct task_struct *task;
251	__poll_t poll_flags = `0`;
252
253	poll_wait(filp: file, wait_address: &pid->wait_pidfd, p: pts);
254	/*
255	* Don't wake waiters if the thread-group leader exited
256	* prematurely. They either get notified when the last subthread
257	* exits or not at all if one of the remaining subthreads execs
258	* and assumes the struct pid of the old thread-group leader.
259	*/
260	guard(rcu)();
261	task = pid_task(pid, PIDTYPE_PID);
262	if (!task)
263	poll_flags = EPOLLIN \| EPOLLRDNORM \| EPOLLHUP;
264	else if (task->exit_state && !delay_group_leader(task))
265	poll_flags = EPOLLIN \| EPOLLRDNORM;
266
267	return poll_flags;
268	}
269
270	static inline bool pid_in_current_pidns(const struct pid *pid)
271	{
272	const struct pid_namespace *ns = task_active_pid_ns(current);
273
274	if (ns->level <= pid->level)
275	return pid->numbers[ns->level].ns == ns;
276
277	return false;
278	}
279
280	static __u32 pidfs_coredump_mask(unsigned long mm_flags)
281	{
282	switch (__get_dumpable(mm_flags)) {
283	case SUID_DUMP_USER:
284	return PIDFD_COREDUMP_USER;
285	case SUID_DUMP_ROOT:
286	return PIDFD_COREDUMP_ROOT;
287	case SUID_DUMP_DISABLE:
288	return PIDFD_COREDUMP_SKIP;
289	default:
290	WARN_ON_ONCE(true);
291	}
292
293	return `0`;
294	}
295
/* This must be updated whenever a new flag is added */
#define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \
			      PIDFD_INFO_CREDS | \
			      PIDFD_INFO_CGROUPID | \
			      PIDFD_INFO_EXIT | \
			      PIDFD_INFO_COREDUMP | \
			      PIDFD_INFO_SUPPORTED_MASK | \
			      PIDFD_INFO_COREDUMP_SIGNAL)
304
305	static long pidfd_info(struct file file, unsigned* int cmd, unsigned long arg)
306	{
307	struct pidfd_info __user uinfo = (struct* pidfd_info __user *)arg;
308	struct task_struct *task __free(put_task) = NULL;
309	struct pid *pid = pidfd_pid(file);
310	size_t usize = _IOC_SIZE(cmd);
311	struct pidfd_info kinfo = {};
312	struct user_namespace *user_ns;
313	struct pidfs_attr *attr;
314	const struct cred *c;
315	__u64 mask;
316
317	BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2);
318
319	if (!uinfo)
320	return -EINVAL;
321	if (usize < PIDFD_INFO_SIZE_VER0)
322	return -EINVAL; / First version, no smaller struct possible /
323
324	if (copy_from_user(to: &mask, from: &uinfo->mask, n: sizeof(mask)))
325	return -EFAULT;
326
327	/*
328	* Restrict information retrieval to tasks within the caller's pid
329	* namespace hierarchy.
330	*/
331	if (!pid_in_current_pidns(pid))
332	return -ESRCH;
333
334	attr = READ_ONCE(pid->attr);
335	if (mask & PIDFD_INFO_EXIT) {
336	if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) {
337	smp_rmb();
338	kinfo.mask \|= PIDFD_INFO_EXIT;
339	#ifdef CONFIG_CGROUPS
340	kinfo.cgroupid = attr->cgroupid;
341	kinfo.mask \|= PIDFD_INFO_CGROUPID;
342	#endif
343	kinfo.exit_code = attr->exit_code;
344	}
345	}
346
347	if (mask & PIDFD_INFO_COREDUMP) {
348	if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) {
349	smp_rmb();
350	kinfo.mask \|= PIDFD_INFO_COREDUMP \| PIDFD_INFO_COREDUMP_SIGNAL;
351	kinfo.coredump_mask = attr->coredump_mask;
352	kinfo.coredump_signal = attr->coredump_signal;
353	}
354	}
355
356	task = get_pid_task(pid, PIDTYPE_PID);
357	if (!task) {
358	/*
359	* If the task has already been reaped, only exit
360	* information is available
361	*/
362	if (!(mask & PIDFD_INFO_EXIT))
363	return -ESRCH;
364
365	goto copy_out;
366	}
367
368	c = get_task_cred(task);
369	if (!c)
370	return -ESRCH;
371
372	if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) {
373	guard(task_lock)(T: task);
374	if (task->mm) {
375	unsigned long flags = __mm_flags_get_dumpable(mm: task->mm);
376
377	kinfo.coredump_mask = pidfs_coredump_mask(mm_flags: flags);
378	kinfo.mask \|= PIDFD_INFO_COREDUMP;
379	/ No coredump actually took place, so no coredump signal. /
380	}
381	}
382
383	/ Unconditionally return identifiers and credentials, the rest only on request /
384
385	user_ns = current_user_ns();
386	kinfo.ruid = from_kuid_munged(to: user_ns, uid: c->uid);
387	kinfo.rgid = from_kgid_munged(to: user_ns, gid: c->gid);
388	kinfo.euid = from_kuid_munged(to: user_ns, uid: c->euid);
389	kinfo.egid = from_kgid_munged(to: user_ns, gid: c->egid);
390	kinfo.suid = from_kuid_munged(to: user_ns, uid: c->suid);
391	kinfo.sgid = from_kgid_munged(to: user_ns, gid: c->sgid);
392	kinfo.fsuid = from_kuid_munged(to: user_ns, uid: c->fsuid);
393	kinfo.fsgid = from_kgid_munged(to: user_ns, gid: c->fsgid);
394	kinfo.mask \|= PIDFD_INFO_CREDS;
395	put_cred(cred: c);
396
397	#ifdef CONFIG_CGROUPS
398	if (!kinfo.cgroupid) {
399	struct cgroup *cgrp;
400
401	rcu_read_lock();
402	cgrp = task_dfl_cgroup(task);
403	kinfo.cgroupid = cgroup_id(cgrp);
404	kinfo.mask \|= PIDFD_INFO_CGROUPID;
405	rcu_read_unlock();
406	}
407	#endif
408
409	/*
410	* Copy pid/tgid last, to reduce the chances the information might be
411	* stale. Note that it is not possible to ensure it will be valid as the
412	* task might return as soon as the copy_to_user finishes, but that's ok
413	* and userspace expects that might happen and can act accordingly, so
414	* this is just best-effort. What we can do however is checking that all
415	* the fields are set correctly, or return ESRCH to avoid providing
416	* incomplete information. */
417
418	kinfo.ppid = task_ppid_nr_ns(tsk: task, NULL);
419	kinfo.tgid = task_tgid_vnr(tsk: task);
420	kinfo.pid = task_pid_vnr(tsk: task);
421	kinfo.mask \|= PIDFD_INFO_PID;
422
423	if (kinfo.pid == `0` \|\| kinfo.tgid == `0`)
424	return -ESRCH;
425
426	copy_out:
427	if (mask & PIDFD_INFO_SUPPORTED_MASK) {
428	kinfo.mask \|= PIDFD_INFO_SUPPORTED_MASK;
429	kinfo.supported_mask = PIDFD_INFO_SUPPORTED;
430	}
431
432	/ Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? /
433	WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask);
434	/*
435	* If userspace and the kernel have the same struct size it can just
436	* be copied. If userspace provides an older struct, only the bits that
437	* userspace knows about will be copied. If userspace provides a new
438	* struct, only the bits that the kernel knows about will be copied.
439	*/
440	return copy_struct_to_user(dst: uinfo, usize, src: &kinfo, ksize: sizeof(kinfo), NULL);
441	}
442
443	static bool pidfs_ioctl_valid(unsigned int cmd)
444	{
445	switch (cmd) {
446	case FS_IOC_GETVERSION:
447	case PIDFD_GET_CGROUP_NAMESPACE:
448	case PIDFD_GET_IPC_NAMESPACE:
449	case PIDFD_GET_MNT_NAMESPACE:
450	case PIDFD_GET_NET_NAMESPACE:
451	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
452	case PIDFD_GET_TIME_NAMESPACE:
453	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
454	case PIDFD_GET_UTS_NAMESPACE:
455	case PIDFD_GET_USER_NAMESPACE:
456	case PIDFD_GET_PID_NAMESPACE:
457	return true;
458	}
459
460	/ Extensible ioctls require some more careful checks. /
461	switch (_IOC_NR(cmd)) {
462	case _IOC_NR(PIDFD_GET_INFO):
463	/*
464	* Try to prevent performing a pidfd ioctl when someone
465	* erronously mistook the file descriptor for a pidfd.
466	* This is not perfect but will catch most cases.
467	*/
468	return extensible_ioctl_valid(cmd_a: cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
469	}
470
471	return false;
472	}
473
474	static long pidfd_ioctl(struct file file, unsigned* int cmd, unsigned long arg)
475	{
476	struct task_struct *task __free(put_task) = NULL;
477	struct nsproxy *nsp __free(put_nsproxy) = NULL;
478	struct ns_common *ns_common = NULL;
479
480	if (!pidfs_ioctl_valid(cmd))
481	return -ENOIOCTLCMD;
482
483	if (cmd == FS_IOC_GETVERSION) {
484	if (!arg)
485	return -EINVAL;
486
487	__u32 __user argp = (__u32 __user )arg;
488	return put_user(file_inode(file)->i_generation, argp);
489	}
490
491	/ Extensible IOCTL that does not open namespace FDs, take a shortcut /
492	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
493	return pidfd_info(file, cmd, arg);
494
495	task = get_pid_task(pid: pidfd_pid(file), PIDTYPE_PID);
496	if (!task)
497	return -ESRCH;
498
499	if (arg)
500	return -EINVAL;
501
502	scoped_guard(task_lock, task) {
503	nsp = task->nsproxy;
504	if (nsp)
505	get_nsproxy(ns: nsp);
506	}
507	if (!nsp)
508	return -ESRCH; / just pretend it didn't exist /
509
510	/*
511	* We're trying to open a file descriptor to the namespace so perform a
512	* filesystem cred ptrace check. Also, we mirror nsfs behavior.
513	*/
514	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
515	return -EACCES;
516
517	switch (cmd) {
518	/ Namespaces that hang of nsproxy. /
519	case PIDFD_GET_CGROUP_NAMESPACE:
520	#ifdef CONFIG_CGROUPS
521	if (!ns_ref_get(nsp->cgroup_ns))
522	break;
523	ns_common = to_ns_common(nsp->cgroup_ns);
524	#endif
525	break;
526	case PIDFD_GET_IPC_NAMESPACE:
527	#ifdef CONFIG_IPC_NS
528	if (!ns_ref_get(nsp->ipc_ns))
529	break;
530	ns_common = to_ns_common(nsp->ipc_ns);
531	#endif
532	break;
533	case PIDFD_GET_MNT_NAMESPACE:
534	if (!ns_ref_get(nsp->mnt_ns))
535	break;
536	ns_common = to_ns_common(nsp->mnt_ns);
537	break;
538	case PIDFD_GET_NET_NAMESPACE:
539	#ifdef CONFIG_NET_NS
540	if (!ns_ref_get(nsp->net_ns))
541	break;
542	ns_common = to_ns_common(nsp->net_ns);
543	#endif
544	break;
545	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
546	#ifdef CONFIG_PID_NS
547	if (!ns_ref_get(nsp->pid_ns_for_children))
548	break;
549	ns_common = to_ns_common(nsp->pid_ns_for_children);
550	#endif
551	break;
552	case PIDFD_GET_TIME_NAMESPACE:
553	#ifdef CONFIG_TIME_NS
554	if (!ns_ref_get(nsp->time_ns))
555	break;
556	ns_common = to_ns_common(nsp->time_ns);
557	#endif
558	break;
559	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
560	#ifdef CONFIG_TIME_NS
561	if (!ns_ref_get(nsp->time_ns_for_children))
562	break;
563	ns_common = to_ns_common(nsp->time_ns_for_children);
564	#endif
565	break;
566	case PIDFD_GET_UTS_NAMESPACE:
567	#ifdef CONFIG_UTS_NS
568	if (!ns_ref_get(nsp->uts_ns))
569	break;
570	ns_common = to_ns_common(nsp->uts_ns);
571	#endif
572	break;
573	/ Namespaces that don't hang of nsproxy. /
574	case PIDFD_GET_USER_NAMESPACE:
575	#ifdef CONFIG_USER_NS
576	scoped_guard(rcu) {
577	struct user_namespace *user_ns;
578
579	user_ns = task_cred_xxx(task, user_ns);
580	if (!ns_ref_get(user_ns))
581	break;
582	ns_common = to_ns_common(user_ns);
583	}
584	#endif
585	break;
586	case PIDFD_GET_PID_NAMESPACE:
587	#ifdef CONFIG_PID_NS
588	scoped_guard(rcu) {
589	struct pid_namespace *pid_ns;
590
591	pid_ns = task_active_pid_ns(tsk: task);
592	if (!ns_ref_get(pid_ns))
593	break;
594	ns_common = to_ns_common(pid_ns);
595	}
596	#endif
597	break;
598	default:
599	return -ENOIOCTLCMD;
600	}
601
602	if (!ns_common)
603	return -EOPNOTSUPP;
604
605	/ open_namespace() unconditionally consumes the reference /
606	return open_namespace(ns: ns_common);
607	}
608
609	static const struct file_operations pidfs_file_operations = {
610	.poll = pidfd_poll,
611	#ifdef CONFIG_PROC_FS
612	.show_fdinfo = pidfd_show_fdinfo,
613	#endif
614	.unlocked_ioctl = pidfd_ioctl,
615	.compat_ioctl = compat_ptr_ioctl,
616	};
617
618	struct pid pidfd_pid(const* struct file *file)
619	{
620	if (file->f_op != &pidfs_file_operations)
621	return ERR_PTR(error: -EBADF);
622	return file_inode(f: file)->i_private;
623	}
624
625	/*
626	* We're called from release_task(). We know there's at least one
627	* reference to struct pid being held that won't be released until the
628	* task has been reaped which cannot happen until we're out of
629	* release_task().
630	*
631	* If this struct pid has at least once been referred to by a pidfd then
632	* pid->attr will be allocated. If not we mark the struct pid as dead so
633	* anyone who is trying to register it with pidfs will fail to do so.
634	* Otherwise we would hand out pidfs for reaped tasks without having
635	* exit information available.
636	*
637	* Worst case is that we've filled in the info and the pid gets freed
638	* right away in free_pid() when no one holds a pidfd anymore. Since
639	* pidfs_exit() currently is placed after exit_task_work() we know that
640	* it cannot be us aka the exiting task holding a pidfd to itself.
641	*/
642	void pidfs_exit(struct task_struct *tsk)
643	{
644	struct pid *pid = task_pid(task: tsk);
645	struct pidfs_attr *attr;
646	#ifdef CONFIG_CGROUPS
647	struct cgroup *cgrp;
648	#endif
649
650	might_sleep();
651
652	/ Synchronize with pidfs_register_pid(). /
653	scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
654	attr = pid->attr;
655	if (!attr) {
656	/*
657	* No one ever held a pidfd for this struct pid.
658	* Mark it as dead so no one can add a pidfs
659	* entry anymore. We're about to be reaped and
660	* so no exit information would be available.
661	*/
662	pid->attr = PIDFS_PID_DEAD;
663	return;
664	}
665	}
666
667	/*
668	* If @pid->attr is set someone might still legitimately hold a
669	* pidfd to @pid or someone might concurrently still be getting
670	* a reference to an already stashed dentry from @pid->stashed.
671	* So defer cleaning @pid->attr until the last reference to @pid
672	* is put
673	*/
674
675	#ifdef CONFIG_CGROUPS
676	rcu_read_lock();
677	cgrp = task_dfl_cgroup(task: tsk);
678	attr->cgroupid = cgroup_id(cgrp);
679	rcu_read_unlock();
680	#endif
681	attr->exit_code = tsk->exit_code;
682
683	/ Ensure that PIDFD_GET_INFO sees either all or nothing. /
684	smp_wmb();
685	set_bit(nr: PIDFS_ATTR_BIT_EXIT, addr: &attr->attr_mask);
686	}
687
#ifdef CONFIG_COREDUMP
/**
 * pidfs_coredump - record coredump information for a struct pid
 * @cprm: coredump parameters of the dumping task
 *
 * Stores the coredump mask and the signal number in the pidfs attributes
 * of @cprm->pid and then publishes them by setting
 * PIDFS_ATTR_BIT_COREDUMP.
 */
void pidfs_coredump(const struct coredump_params *cprm)
{
	struct pid *pid = cprm->pid;
	struct pidfs_attr *attr;

	attr = READ_ONCE(pid->attr);

	VFS_WARN_ON_ONCE(!attr);
	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);

	/* Note how we were coredumped and that we coredumped. */
	attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
			      PIDFD_COREDUMPED;
	/* If coredumping is set to skip we should never end up here. */
	VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
	/* Expose the signal number that caused the coredump. */
	attr->coredump_signal = cprm->siginfo->si_signo;
	/* Make the fields visible before the bit that announces them. */
	smp_wmb();
	set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask);
}
#endif
710
711	static struct vfsmount *pidfs_mnt __ro_after_init;
712
/*
 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
 * implemented. Let's reject it completely until we have a clean
 * permission concept for pidfds.
 *
 * The request is handed to anon_inode_setattr() unchanged.
 */
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	return anon_inode_setattr(idmap, dentry, attr);
}
723
724	static int pidfs_getattr(struct mnt_idmap idmap, const* struct path *path,
725	struct kstat *stat, u32 request_mask,
726	unsigned int query_flags)
727	{
728	return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
729	}
730
731	static ssize_t pidfs_listxattr(struct dentry dentry, char* *buf, size_t size)
732	{
733	struct inode *inode = d_inode(dentry);
734	struct pid *pid = inode->i_private;
735	struct pidfs_attr *attr = pid->attr;
736	struct simple_xattrs *xattrs;
737
738	xattrs = READ_ONCE(attr->xattrs);
739	if (!xattrs)
740	return `0`;
741
742	return simple_xattr_list(inode, xattrs, buffer: buf, size);
743	}
744
745	static const struct inode_operations pidfs_inode_operations = {
746	.getattr = pidfs_getattr,
747	.setattr = pidfs_setattr,
748	.listxattr = pidfs_listxattr,
749	};
750
751	static void pidfs_evict_inode(struct inode *inode)
752	{
753	struct pid *pid = inode->i_private;
754
755	clear_inode(inode);
756	put_pid(pid);
757	}
758
759	static const struct super_operations pidfs_sops = {
760	.drop_inode = inode_just_drop,
761	.evict_inode = pidfs_evict_inode,
762	.statfs = simple_statfs,
763	};
764
/*
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
773
774	const struct dentry_operations pidfs_dentry_operations = {
775	.d_dname = pidfs_dname,
776	.d_prune = stashed_dentry_prune,
777	};
778
779	static int pidfs_encode_fh(struct inode inode, u32 fh, int *max_len,
780	struct inode *parent)
781	{
782	const struct pid *pid = inode->i_private;
783
784	if (*max_len < `2`) {
785	*max_len = `2`;
786	return FILEID_INVALID;
787	}
788
789	*max_len = `2`;
790	(u64 )fh = pid->ino;
791	return FILEID_KERNFS;
792	}
793
794	static int pidfs_ino_find(const void key, const* struct rb_node *node)
795	{
796	const u64 pid_ino = (u64 )key;
797	const struct pid pid = rb_entry(node, struct* pid, pidfs_node);
798
799	if (pid_ino < pid->ino)
800	return -`1`;
801	if (pid_ino > pid->ino)
802	return `1`;
803	return `0`;
804	}
805
806	/ Find a struct pid based on the inode number. /
807	static struct pid *pidfs_ino_get_pid(u64 ino)
808	{
809	struct pid *pid;
810	struct rb_node *node;
811	unsigned int seq;
812
813	guard(rcu)();
814	do {
815	seq = read_seqcount_begin(&pidmap_lock_seq);
816	node = rb_find_rcu(key: &ino, tree: &pidfs_ino_tree, cmp: pidfs_ino_find);
817	if (node)
818	break;
819	} while (read_seqcount_retry(&pidmap_lock_seq, seq));
820
821	if (!node)
822	return NULL;
823
824	pid = rb_entry(node, struct pid, pidfs_node);
825
826	/ Within our pid namespace hierarchy? /
827	if (pid_vnr(pid) == `0`)
828	return NULL;
829
830	return get_pid(pid);
831	}
832
833	static struct dentry pidfs_fh_to_dentry(struct* super_block *sb,
834	struct fid fid, int* fh_len,
835	int fh_type)
836	{
837	int ret;
838	u64 pid_ino;
839	struct path path;
840	struct pid *pid;
841
842	if (fh_len < `2`)
843	return NULL;
844
845	switch (fh_type) {
846	case FILEID_KERNFS:
847	pid_ino = (u64 )fid;
848	break;
849	default:
850	return NULL;
851	}
852
853	pid = pidfs_ino_get_pid(ino: pid_ino);
854	if (!pid)
855	return NULL;
856
857	ret = path_from_stashed(stashed: &pid->stashed, mnt: pidfs_mnt, data: pid, path: &path);
858	if (ret < `0`)
859	return ERR_PTR(error: ret);
860
861	VFS_WARN_ON_ONCE(!pid->attr);
862
863	mntput(mnt: path.mnt);
864	return path.dentry;
865	}
866
/*
 * Make sure that we reject any nonsensical flags that users pass via
 * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
 * PIDFD_NONBLOCK as O_NONBLOCK.
 */
#define VALID_FILE_HANDLE_OPEN_FLAGS \
	(O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
874
875	static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
876	unsigned int oflags)
877	{
878	if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS \| O_LARGEFILE))
879	return -EINVAL;
880
881	/*
882	* pidfd_ino_get_pid() will verify that the struct pid is part
883	* of the caller's pid namespace hierarchy. No further
884	* permission checks are needed.
885	*/
886	return `0`;
887	}
888
889	static struct file pidfs_export_open(const* struct path path, unsigned* int oflags)
890	{
891	/*
892	* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
893	* O_RDWR as pidfds always are.
894	*/
895	oflags &= ~O_LARGEFILE;
896	return dentry_open(path, flags: oflags \| O_RDWR, current_cred());
897	}
898
899	static const struct export_operations pidfs_export_operations = {
900	.encode_fh = pidfs_encode_fh,
901	.fh_to_dentry = pidfs_fh_to_dentry,
902	.open = pidfs_export_open,
903	.permission = pidfs_export_permission,
904	};
905
906	static int pidfs_init_inode(struct inode inode, void* *data)
907	{
908	const struct pid *pid = data;
909
910	inode->i_private = data;
911	inode->i_flags \|= S_PRIVATE \| S_ANON_INODE;
912	/ We allow to set xattrs. /
913	inode->i_flags &= ~S_IMMUTABLE;
914	inode->i_mode \|= S_IRWXU;
915	inode->i_op = &pidfs_inode_operations;
916	inode->i_fop = &pidfs_file_operations;
917	inode->i_ino = pidfs_ino(ino: pid->ino);
918	inode->i_generation = pidfs_gen(ino: pid->ino);
919	return `0`;
920	}
921
/* ->put_data() of the stashed operations: drop a struct pid reference. */
static void pidfs_put_data(void *data)
{
	struct pid *pid = data;

	put_pid(pid);
}
927
928	/**
929	* pidfs_register_pid - register a struct pid in pidfs
930	* @pid: pid to pin
931	*
932	* Register a struct pid in pidfs.
933	*
934	* Return: On success zero, on error a negative error code is returned.
935	*/
936	int pidfs_register_pid(struct pid *pid)
937	{
938	struct pidfs_attr *new_attr __free(kfree) = NULL;
939	struct pidfs_attr *attr;
940
941	might_sleep();
942
943	if (!pid)
944	return `0`;
945
946	attr = READ_ONCE(pid->attr);
947	if (unlikely(attr == PIDFS_PID_DEAD))
948	return PTR_ERR(PIDFS_PID_DEAD);
949	if (attr)
950	return `0`;
951
952	new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
953	if (!new_attr)
954	return -ENOMEM;
955
956	/ Synchronize with pidfs_exit(). /
957	guard(spinlock_irq)(l: &pid->wait_pidfd.lock);
958
959	attr = pid->attr;
960	if (unlikely(attr == PIDFS_PID_DEAD))
961	return PTR_ERR(PIDFS_PID_DEAD);
962	if (unlikely(attr))
963	return `0`;
964
965	pid->attr = no_free_ptr(new_attr);
966	return `0`;
967	}
968
969	static struct dentry pidfs_stash_dentry(struct* dentry **stashed,
970	struct dentry *dentry)
971	{
972	int ret;
973	struct pid *pid = d_inode(dentry)->i_private;
974
975	VFS_WARN_ON_ONCE(stashed != &pid->stashed);
976
977	ret = pidfs_register_pid(pid);
978	if (ret)
979	return ERR_PTR(error: ret);
980
981	return stash_dentry(stashed, dentry);
982	}
983
984	static const struct stashed_operations pidfs_stashed_ops = {
985	.stash_dentry = pidfs_stash_dentry,
986	.init_inode = pidfs_init_inode,
987	.put_data = pidfs_put_data,
988	};
989
990	static int pidfs_xattr_get(const struct xattr_handler *handler,
991	struct dentry unused, struct* inode *inode,
992	const char suffix, void* *value, size_t size)
993	{
994	struct pid *pid = inode->i_private;
995	struct pidfs_attr *attr = pid->attr;
996	const char *name;
997	struct simple_xattrs *xattrs;
998
999	xattrs = READ_ONCE(attr->xattrs);
1000	if (!xattrs)
1001	return `0`;
1002
1003	name = xattr_full_name(handler, suffix);
1004	return simple_xattr_get(xattrs, name, buffer: value, size);
1005	}
1006
1007	static int pidfs_xattr_set(const struct xattr_handler *handler,
1008	struct mnt_idmap idmap, struct* dentry *unused,
1009	struct inode inode, const* char *suffix,
1010	const void value, size_t size, int* flags)
1011	{
1012	struct pid *pid = inode->i_private;
1013	struct pidfs_attr *attr = pid->attr;
1014	const char *name;
1015	struct simple_xattrs *xattrs;
1016	struct simple_xattr *old_xattr;
1017
1018	/ Ensure we're the only one to set @attr->xattrs. /
1019	WARN_ON_ONCE(!inode_is_locked(inode));
1020
1021	xattrs = READ_ONCE(attr->xattrs);
1022	if (!xattrs) {
1023	xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
1024	if (!xattrs)
1025	return -ENOMEM;
1026
1027	simple_xattrs_init(xattrs);
1028	smp_store_release(&pid->attr->xattrs, xattrs);
1029	}
1030
1031	name = xattr_full_name(handler, suffix);
1032	old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
1033	if (IS_ERR(ptr: old_xattr))
1034	return PTR_ERR(ptr: old_xattr);
1035
1036	simple_xattr_free(xattr: old_xattr);
1037	return `0`;
1038	}
1039
1040	static const struct xattr_handler pidfs_trusted_xattr_handler = {
1041	.prefix = XATTR_TRUSTED_PREFIX,
1042	.get = pidfs_xattr_get,
1043	.set = pidfs_xattr_set,
1044	};
1045
1046	static const struct xattr_handler *const pidfs_xattr_handlers[] = {
1047	&pidfs_trusted_xattr_handler,
1048	NULL
1049	};
1050
1051	static int pidfs_init_fs_context(struct fs_context *fc)
1052	{
1053	struct pseudo_fs_context *ctx;
1054
1055	ctx = init_pseudo(fc, PID_FS_MAGIC);
1056	if (!ctx)
1057	return -ENOMEM;
1058
1059	fc->s_iflags \|= SB_I_NOEXEC;
1060	fc->s_iflags \|= SB_I_NODEV;
1061	ctx->s_d_flags \|= DCACHE_DONTCACHE;
1062	ctx->ops = &pidfs_sops;
1063	ctx->eops = &pidfs_export_operations;
1064	ctx->dops = &pidfs_dentry_operations;
1065	ctx->xattr = pidfs_xattr_handlers;
1066	fc->s_fs_info = (void *)&pidfs_stashed_ops;
1067	return `0`;
1068	}
1069
1070	static struct file_system_type pidfs_type = {
1071	.name = "pidfs",
1072	.init_fs_context = pidfs_init_fs_context,
1073	.kill_sb = kill_anon_super,
1074	};
1075
1076	struct file pidfs_alloc_file(struct* pid pid, unsigned* int flags)
1077	{
1078	struct file *pidfd_file;
1079	struct path path __free(path_put) = {};
1080	int ret;
1081
1082	/*
1083	* Ensure that PIDFD_STALE can be passed as a flag without
1084	* overloading other uapi pidfd flags.
1085	*/
1086	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
1087	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
1088
1089	ret = path_from_stashed(stashed: &pid->stashed, mnt: pidfs_mnt, data: get_pid(pid), path: &path);
1090	if (ret < `0`)
1091	return ERR_PTR(error: ret);
1092
1093	VFS_WARN_ON_ONCE(!pid->attr);
1094
1095	flags &= ~PIDFD_STALE;
1096	flags \|= O_RDWR;
1097	pidfd_file = dentry_open(path: &path, flags, current_cred());
1098	/ Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. /
1099	if (!IS_ERR(ptr: pidfd_file))
1100	pidfd_file->f_flags \|= (flags & PIDFD_THREAD);
1101
1102	return pidfd_file;
1103	}
1104
1105	void __init pidfs_init(void)
1106	{
1107	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), `0`,
1108	(SLAB_HWCACHE_ALIGN \| SLAB_RECLAIM_ACCOUNT \|
1109	SLAB_ACCOUNT \| SLAB_PANIC), NULL);
1110
1111	pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
1112	sizeof(struct simple_xattrs), `0`,
1113	(SLAB_HWCACHE_ALIGN \| SLAB_RECLAIM_ACCOUNT \|
1114	SLAB_ACCOUNT \| SLAB_PANIC), NULL);
1115
1116	pidfs_mnt = kern_mount(&pidfs_type);
1117	if (IS_ERR(ptr: pidfs_mnt))
1118	panic(fmt: "Failed to mount pidfs pseudo filesystem");
1119
1120	pidfs_root_path.mnt = pidfs_mnt;
1121	pidfs_root_path.dentry = pidfs_mnt->mnt_root;
1122	}
1123

source code of linux/fs/pidfs.c