namespace.c source code [linux/fs/namespace.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/namespace.c
4	*
5	* (C) Copyright Al Viro 2000, 2001
6	*
7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
8	* Heavily rewritten.
9	*/
10
11	#include <linux/syscalls.h>
12	#include <linux/export.h>
13	#include <linux/capability.h>
14	#include <linux/mnt_namespace.h>
15	#include <linux/user_namespace.h>
16	#include <linux/namei.h>
17	#include <linux/security.h>
18	#include <linux/cred.h>
19	#include <linux/idr.h>
20	#include <linux/init.h> /* init_rootfs */
21	#include <linux/fs_struct.h> /* get_fs_root et.al. */
22	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23	#include <linux/file.h>
24	#include <linux/uaccess.h>
25	#include <linux/proc_ns.h>
26	#include <linux/magic.h>
27	#include <linux/memblock.h>
28	#include <linux/proc_fs.h>
29	#include <linux/task_work.h>
30	#include <linux/sched/task.h>
31	#include <uapi/linux/mount.h>
32	#include <linux/fs_context.h>
33	#include <linux/shmem_fs.h>
34	#include <linux/mnt_idmapping.h>
35	#include <linux/pidfs.h>
36	#include <linux/nstree.h>
37
38	#include "pnode.h"
39	#include "internal.h"
40
41	/ Maximum number of mounts in a mount namespace /
42	static unsigned int sysctl_mount_max __read_mostly = `100000`;
43
44	static unsigned int m_hash_mask __ro_after_init;
45	static unsigned int m_hash_shift __ro_after_init;
46	static unsigned int mp_hash_mask __ro_after_init;
47	static unsigned int mp_hash_shift __ro_after_init;
48
49	static __initdata unsigned long mhash_entries;
50	static int __init set_mhash_entries(char *str)
51	{
52	if (!str)
53	return `0`;
54	mhash_entries = simple_strtoul(str, &str, `0`);
55	return `1`;
56	}
57	__setup("mhash_entries=", set_mhash_entries);
58
59	static __initdata unsigned long mphash_entries;
60	static int __init set_mphash_entries(char *str)
61	{
62	if (!str)
63	return `0`;
64	mphash_entries = simple_strtoul(str, &str, `0`);
65	return `1`;
66	}
67	__setup("mphash_entries=", set_mphash_entries);
68
69	static char * __initdata initramfs_options;
70	static int __init initramfs_options_setup(char *str)
71	{
72	initramfs_options = str;
73	return `1`;
74	}
75
76	__setup("initramfs_options=", initramfs_options_setup);
77
78	static u64 event;
79	static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
80	static DEFINE_IDA(mnt_group_ida);
81
82	/ Don't allow confusion with old 32bit mount ID /
83	#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
84	static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
85
86	static struct hlist_head *mount_hashtable __ro_after_init;
87	static struct hlist_head *mountpoint_hashtable __ro_after_init;
88	static struct kmem_cache *mnt_cache __ro_after_init;
89	static DECLARE_RWSEM(namespace_sem);
90	static HLIST_HEAD(unmounted); / protected by namespace_sem /
91	static LIST_HEAD(ex_mountpoints); / protected by namespace_sem /
92	static struct mnt_namespace emptied_ns; /* protected by namespace_sem /
93
94	static inline void namespace_lock(void);
95	static void namespace_unlock(void);
96	DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock())
97	DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem),
98	up_read(&namespace_sem))
99
100	DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T))
101
102	#ifdef CONFIG_FSNOTIFY
103	LIST_HEAD(notify_list); / protected by namespace_sem /
104	#endif
105
/* Bit flags stored in struct mount_kattr::kflags. */
enum mount_kattr_flags_t {
	MOUNT_KATTR_RECURSE		= (1 << 0),
	MOUNT_KATTR_IDMAP_REPLACE	= (1 << 1),
};
110
111	struct mount_kattr {
112	unsigned int attr_set;
113	unsigned int attr_clr;
114	unsigned int propagation;
115	unsigned int lookup_flags;
116	enum mount_kattr_flags_t kflags;
117	struct user_namespace *mnt_userns;
118	struct mnt_idmap *mnt_idmap;
119	};
120
121	/ /sys/fs /
122	struct kobject *fs_kobj __ro_after_init;
123	EXPORT_SYMBOL_GPL(fs_kobj);
124
125	/*
126	* vfsmount lock may be taken for read to prevent changes to the
127	* vfsmount hash, ie. during mountpoint lookups or walking back
128	* up the tree.
129	*
130	* It should be taken for write in all cases where the vfsmount
131	* tree or hash is modified or when a vfsmount structure is modified.
132	*/
133	__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
134
135	static void mnt_ns_release(struct mnt_namespace *ns)
136	{
137	/ keep alive for {list,stat}mount() /
138	if (ns && refcount_dec_and_test(r: &ns->passive)) {
139	fsnotify_mntns_delete(mntns: ns);
140	put_user_ns(ns: ns->user_ns);
141	kfree(objp: ns);
142	}
143	}
144	DEFINE_FREE(mnt_ns_release, struct mnt_namespace *,
145	if (!IS_ERR(_T)) mnt_ns_release(_T))
146
147	static void mnt_ns_release_rcu(struct rcu_head *rcu)
148	{
149	mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu));
150	}
151
152	static void mnt_ns_tree_remove(struct mnt_namespace *ns)
153	{
154	/ remove from global mount namespace list /
155	if (ns_tree_active(ns))
156	ns_tree_remove(ns);
157
158	call_rcu(head: &ns->ns.ns_rcu, func: mnt_ns_release_rcu);
159	}
160
161	/*
162	* Lookup a mount namespace by id and take a passive reference count. Taking a
163	* passive reference means the mount namespace can be emptied if e.g., the last
164	* task holding an active reference exits. To access the mounts of the
165	* namespace the @namespace_sem must first be acquired. If the namespace has
166	* already shut down before acquiring @namespace_sem, {list,stat}mount() will
167	* see that the mount rbtree of the namespace is empty.
168	*
169	* Note the lookup is lockless protected by a sequence counter. We only
170	* need to guard against false negatives as false positives aren't
171	* possible. So if we didn't find a mount namespace and the sequence
172	* counter has changed we need to retry. If the sequence counter is
173	* still the same we know the search actually failed.
174	*/
175	static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
176	{
177	struct mnt_namespace *mnt_ns;
178	struct ns_common *ns;
179
180	guard(rcu)();
181	ns = ns_tree_lookup_rcu(ns_id: mnt_ns_id, CLONE_NEWNS);
182	if (!ns)
183	return NULL;
184
185	/*
186	* The last reference count is put with RCU delay so we can
187	* unconditonally acquire a reference here.
188	*/
189	mnt_ns = container_of(ns, struct mnt_namespace, ns);
190	refcount_inc(r: &mnt_ns->passive);
191	return mnt_ns;
192	}
193
194	static inline void lock_mount_hash(void)
195	{
196	write_seqlock(sl: &mount_lock);
197	}
198
199	static inline void unlock_mount_hash(void)
200	{
201	write_sequnlock(sl: &mount_lock);
202	}
203
204	static inline struct hlist_head m_hash(struct* vfsmount mnt, struct* dentry *dentry)
205	{
206	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
207	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
208	tmp = tmp + (tmp >> m_hash_shift);
209	return &mount_hashtable[tmp & m_hash_mask];
210	}
211
212	static inline struct hlist_head mp_hash(struct* dentry *dentry)
213	{
214	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
215	tmp = tmp + (tmp >> mp_hash_shift);
216	return &mountpoint_hashtable[tmp & mp_hash_mask];
217	}
218
219	static int mnt_alloc_id(struct mount *mnt)
220	{
221	int res;
222
223	xa_lock(&mnt_id_xa);
224	res = __xa_alloc(&mnt_id_xa, id: &mnt->mnt_id, entry: mnt, XA_LIMIT(`1`, INT_MAX), GFP_KERNEL);
225	if (!res)
226	mnt->mnt_id_unique = ++mnt_id_ctr;
227	xa_unlock(&mnt_id_xa);
228	return res;
229	}
230
231	static void mnt_free_id(struct mount *mnt)
232	{
233	xa_erase(&mnt_id_xa, index: mnt->mnt_id);
234	}
235
236	/*
237	* Allocate a new peer group ID
238	*/
239	static int mnt_alloc_group_id(struct mount *mnt)
240	{
241	int res = ida_alloc_min(ida: &mnt_group_ida, min: `1`, GFP_KERNEL);
242
243	if (res < `0`)
244	return res;
245	mnt->mnt_group_id = res;
246	return `0`;
247	}
248
249	/*
250	* Release a peer group ID
251	*/
252	void mnt_release_group_id(struct mount *mnt)
253	{
254	ida_free(&mnt_group_ida, id: mnt->mnt_group_id);
255	mnt->mnt_group_id = `0`;
256	}
257
258	/*
259	* vfsmount lock must be held for read
260	*/
261	static inline void mnt_add_count(struct mount mnt, int* n)
262	{
263	#ifdef CONFIG_SMP
264	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
265	#else
266	preempt_disable();
267	mnt->mnt_count += n;
268	preempt_enable();
269	#endif
270	}
271
272	/*
273	* vfsmount lock must be held for write
274	*/
275	int mnt_get_count(struct mount *mnt)
276	{
277	#ifdef CONFIG_SMP
278	int count = `0`;
279	int cpu;
280
281	for_each_possible_cpu(cpu) {
282	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
283	}
284
285	return count;
286	#else
287	return mnt->mnt_count;
288	#endif
289	}
290
291	static struct mount alloc_vfsmnt(const* char *name)
292	{
293	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
294	if (mnt) {
295	int err;
296
297	err = mnt_alloc_id(mnt);
298	if (err)
299	goto out_free_cache;
300
301	if (name)
302	mnt->mnt_devname = kstrdup_const(s: name,
303	GFP_KERNEL_ACCOUNT);
304	else
305	mnt->mnt_devname = "none";
306	if (!mnt->mnt_devname)
307	goto out_free_id;
308
309	#ifdef CONFIG_SMP
310	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
311	if (!mnt->mnt_pcp)
312	goto out_free_devname;
313
314	this_cpu_add(mnt->mnt_pcp->mnt_count, `1`);
315	#else
316	mnt->mnt_count = `1`;
317	mnt->mnt_writers = `0`;
318	#endif
319
320	INIT_HLIST_NODE(h: &mnt->mnt_hash);
321	INIT_LIST_HEAD(list: &mnt->mnt_child);
322	INIT_LIST_HEAD(list: &mnt->mnt_mounts);
323	INIT_LIST_HEAD(list: &mnt->mnt_list);
324	INIT_LIST_HEAD(list: &mnt->mnt_expire);
325	INIT_LIST_HEAD(list: &mnt->mnt_share);
326	INIT_HLIST_HEAD(&mnt->mnt_slave_list);
327	INIT_HLIST_NODE(h: &mnt->mnt_slave);
328	INIT_HLIST_NODE(h: &mnt->mnt_mp_list);
329	INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
330	RB_CLEAR_NODE(&mnt->mnt_node);
331	mnt->mnt.mnt_idmap = &nop_mnt_idmap;
332	}
333	return mnt;
334
335	#ifdef CONFIG_SMP
336	out_free_devname:
337	kfree_const(x: mnt->mnt_devname);
338	#endif
339	out_free_id:
340	mnt_free_id(mnt);
341	out_free_cache:
342	kmem_cache_free(s: mnt_cache, objp: mnt);
343	return NULL;
344	}
345
346	/*
347	* Most r/o checks on a fs are for operations that take
348	* discrete amounts of time, like a write() or unlink().
349	* We must keep track of when those operations start
350	* (for permission checks) and when they end, so that
351	* we can determine when writes are able to occur to
352	* a filesystem.
353	*/
354	/*
355	* __mnt_is_readonly: check whether a mount is read-only
356	* @mnt: the mount to check for its write status
357	*
358	* This shouldn't be used directly ouside of the VFS.
359	* It does not guarantee that the filesystem will stay
360	* r/w, just that it is right now. This can not and
361	* should not be used in place of IS_RDONLY(inode).
362	* mnt_want/drop_write() will _keep_ the filesystem
363	* r/w.
364	*/
365	bool __mnt_is_readonly(const struct vfsmount *mnt)
366	{
367	return (mnt->mnt_flags & MNT_READONLY) \|\| sb_rdonly(sb: mnt->mnt_sb);
368	}
369	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
370
371	static inline void mnt_inc_writers(struct mount *mnt)
372	{
373	#ifdef CONFIG_SMP
374	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
375	#else
376	mnt->mnt_writers++;
377	#endif
378	}
379
380	static inline void mnt_dec_writers(struct mount *mnt)
381	{
382	#ifdef CONFIG_SMP
383	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
384	#else
385	mnt->mnt_writers--;
386	#endif
387	}
388
389	static unsigned int mnt_get_writers(struct mount *mnt)
390	{
391	#ifdef CONFIG_SMP
392	unsigned int count = `0`;
393	int cpu;
394
395	for_each_possible_cpu(cpu) {
396	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
397	}
398
399	return count;
400	#else
401	return mnt->mnt_writers;
402	#endif
403	}
404
405	static int mnt_is_readonly(const struct vfsmount *mnt)
406	{
407	if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
408	return `1`;
409	/*
410	* The barrier pairs with the barrier in sb_start_ro_state_change()
411	* making sure if we don't see s_readonly_remount set yet, we also will
412	* not see any superblock / mount flag changes done by remount.
413	* It also pairs with the barrier in sb_end_ro_state_change()
414	* assuring that if we see s_readonly_remount already cleared, we will
415	* see the values of superblock / mount flags updated by remount.
416	*/
417	smp_rmb();
418	return __mnt_is_readonly(mnt);
419	}
420
421	/*
422	* Most r/o & frozen checks on a fs are for operations that take discrete
423	* amounts of time, like a write() or unlink(). We must keep track of when
424	* those operations start (for permission checks) and when they end, so that we
425	* can determine when writes are able to occur to a filesystem.
426	*/
427	/**
428	* mnt_get_write_access - get write access to a mount without freeze protection
429	* @m: the mount on which to take a write
430	*
431	* This tells the low-level filesystem that a write is about to be performed to
432	* it, and makes sure that writes are allowed (mnt it read-write) before
433	* returning success. This operation does not protect against filesystem being
434	* frozen. When the write operation is finished, mnt_put_write_access() must be
435	* called. This is effectively a refcount.
436	*/
437	int mnt_get_write_access(struct vfsmount *m)
438	{
439	struct mount *mnt = real_mount(mnt: m);
440	int ret = `0`;
441
442	preempt_disable();
443	mnt_inc_writers(mnt);
444	/*
445	* The store to mnt_inc_writers must be visible before we pass
446	* WRITE_HOLD loop below, so that the slowpath can see our
447	* incremented count after it has set WRITE_HOLD.
448	*/
449	smp_mb();
450	might_lock(&mount_lock.lock);
451	while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) {
452	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
453	cpu_relax();
454	} else {
455	/*
456	* This prevents priority inversion, if the task
457	* setting WRITE_HOLD got preempted on a remote
458	* CPU, and it prevents life lock if the task setting
459	* WRITE_HOLD has a lower priority and is bound to
460	* the same CPU as the task that is spinning here.
461	*/
462	preempt_enable();
463	read_seqlock_excl(sl: &mount_lock);
464	read_sequnlock_excl(sl: &mount_lock);
465	preempt_disable();
466	}
467	}
468	/*
469	* The barrier pairs with the barrier sb_start_ro_state_change() making
470	* sure that if we see WRITE_HOLD cleared, we will also see
471	* s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
472	* mnt_is_readonly() and bail in case we are racing with remount
473	* read-only.
474	*/
475	smp_rmb();
476	if (mnt_is_readonly(mnt: m)) {
477	mnt_dec_writers(mnt);
478	ret = -EROFS;
479	}
480	preempt_enable();
481
482	return ret;
483	}
484	EXPORT_SYMBOL_GPL(mnt_get_write_access);
485
486	/**
487	* mnt_want_write - get write access to a mount
488	* @m: the mount on which to take a write
489	*
490	* This tells the low-level filesystem that a write is about to be performed to
491	* it, and makes sure that writes are allowed (mount is read-write, filesystem
492	* is not frozen) before returning success. When the write operation is
493	* finished, mnt_drop_write() must be called. This is effectively a refcount.
494	*/
495	int mnt_want_write(struct vfsmount *m)
496	{
497	int ret;
498
499	sb_start_write(sb: m->mnt_sb);
500	ret = mnt_get_write_access(m);
501	if (ret)
502	sb_end_write(sb: m->mnt_sb);
503	return ret;
504	}
505	EXPORT_SYMBOL_GPL(mnt_want_write);
506
507	/**
508	* mnt_get_write_access_file - get write access to a file's mount
509	* @file: the file who's mount on which to take a write
510	*
511	* This is like mnt_get_write_access, but if @file is already open for write it
512	* skips incrementing mnt_writers (since the open file already has a reference)
513	* and instead only does the check for emergency r/o remounts. This must be
514	* paired with mnt_put_write_access_file.
515	*/
516	int mnt_get_write_access_file(struct file *file)
517	{
518	if (file->f_mode & FMODE_WRITER) {
519	/*
520	* Superblock may have become readonly while there are still
521	* writable fd's, e.g. due to a fs error with errors=remount-ro
522	*/
523	if (__mnt_is_readonly(file->f_path.mnt))
524	return -EROFS;
525	return `0`;
526	}
527	return mnt_get_write_access(file->f_path.mnt);
528	}
529
530	/**
531	* mnt_want_write_file - get write access to a file's mount
532	* @file: the file who's mount on which to take a write
533	*
534	* This is like mnt_want_write, but if the file is already open for writing it
535	* skips incrementing mnt_writers (since the open file already has a reference)
536	* and instead only does the freeze protection and the check for emergency r/o
537	* remounts. This must be paired with mnt_drop_write_file.
538	*/
539	int mnt_want_write_file(struct file *file)
540	{
541	int ret;
542
543	sb_start_write(sb: file_inode(f: file)->i_sb);
544	ret = mnt_get_write_access_file(file);
545	if (ret)
546	sb_end_write(sb: file_inode(f: file)->i_sb);
547	return ret;
548	}
549	EXPORT_SYMBOL_GPL(mnt_want_write_file);
550
/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * mnt_get_write_access() call above.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_put_write_access);
566
567	/**
568	* mnt_drop_write - give up write access to a mount
569	* @mnt: the mount on which to give up write access
570	*
571	* Tells the low-level filesystem that we are done performing writes to it and
572	* also allows filesystem to be frozen again. Must be matched with
573	* mnt_want_write() call above.
574	*/
575	void mnt_drop_write(struct vfsmount *mnt)
576	{
577	mnt_put_write_access(mnt);
578	sb_end_write(sb: mnt->mnt_sb);
579	}
580	EXPORT_SYMBOL_GPL(mnt_drop_write);
581
582	void mnt_put_write_access_file(struct file *file)
583	{
584	if (!(file->f_mode & FMODE_WRITER))
585	mnt_put_write_access(file->f_path.mnt);
586	}
587
588	void mnt_drop_write_file(struct file *file)
589	{
590	mnt_put_write_access_file(file);
591	sb_end_write(sb: file_inode(f: file)->i_sb);
592	}
593	EXPORT_SYMBOL(mnt_drop_write_file);
594
595	/**
596	* mnt_hold_writers - prevent write access to the given mount
597	* @mnt: mnt to prevent write access to
598	*
599	* Prevents write access to @mnt if there are no active writers for @mnt.
600	* This function needs to be called and return successfully before changing
601	* properties of @mnt that need to remain stable for callers with write access
602	* to @mnt.
603	*
604	* After this functions has been called successfully callers must pair it with
605	* a call to mnt_unhold_writers() in order to stop preventing write access to
606	* @mnt.
607	*
608	* Context: This function expects to be in mount_locked_reader scope serializing
609	* setting WRITE_HOLD.
610	* Return: On success 0 is returned.
611	* On error, -EBUSY is returned.
612	*/
613	static inline int mnt_hold_writers(struct mount *mnt)
614	{
615	set_write_hold(mnt);
616	/*
617	* After storing WRITE_HOLD, we'll read the counters. This store
618	* should be visible before we do.
619	*/
620	smp_mb();
621
622	/*
623	* With writers on hold, if this value is zero, then there are
624	* definitely no active writers (although held writers may subsequently
625	* increment the count, they'll have to wait, and decrement it after
626	* seeing MNT_READONLY).
627	*
628	* It is OK to have counter incremented on one CPU and decremented on
629	* another: the sum will add up correctly. The danger would be when we
630	* sum up each counter, if we read a counter before it is incremented,
631	* but then read another CPU's count which it has been subsequently
632	* decremented from -- we would see more decrements than we should.
633	* WRITE_HOLD protects against this scenario, because
634	* mnt_want_write first increments count, then smp_mb, then spins on
635	* WRITE_HOLD, so it can't be decremented by another CPU while
636	* we're counting up here.
637	*/
638	if (mnt_get_writers(mnt) > `0`)
639	return -EBUSY;
640
641	return `0`;
642	}
643
/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a call to mnt_hold_writers().
 * It is a no-op if WRITE_HOLD is not currently set on @mnt.
 *
 * Context: This function expects to be in the same mount_locked_reader scope
 * as the matching mnt_hold_writers().
 */
static inline void mnt_unhold_writers(struct mount *mnt)
{
	if (!test_write_hold(mnt))
		return;
	/*
	 * MNT_READONLY must become visible before ~WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	clear_write_hold(mnt);
}
667
668	static inline void mnt_del_instance(struct mount *m)
669	{
670	struct mount **p = m->mnt_pprev_for_sb;
671	struct mount *next = m->mnt_next_for_sb;
672
673	if (next)
674	next->mnt_pprev_for_sb = p;
675	*p = next;
676	}
677
678	static inline void mnt_add_instance(struct mount m, struct* super_block *s)
679	{
680	struct mount *first = s->s_mounts;
681
682	if (first)
683	first->mnt_pprev_for_sb = &m->mnt_next_for_sb;
684	m->mnt_next_for_sb = first;
685	m->mnt_pprev_for_sb = &s->s_mounts;
686	s->s_mounts = m;
687	}
688
689	static int mnt_make_readonly(struct mount *mnt)
690	{
691	int ret;
692
693	ret = mnt_hold_writers(mnt);
694	if (!ret)
695	mnt->mnt.mnt_flags \|= MNT_READONLY;
696	mnt_unhold_writers(mnt);
697	return ret;
698	}
699
700	int sb_prepare_remount_readonly(struct super_block *sb)
701	{
702	int err = `0`;
703
704	/ Racy optimization. Recheck the counter under WRITE_HOLD /
705	if (atomic_long_read(v: &sb->s_remove_count))
706	return -EBUSY;
707
708	guard(mount_locked_reader)();
709
710	for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
711	if (!(m->mnt.mnt_flags & MNT_READONLY)) {
712	err = mnt_hold_writers(mnt: m);
713	if (err)
714	break;
715	}
716	}
717	if (!err && atomic_long_read(v: &sb->s_remove_count))
718	err = -EBUSY;
719
720	if (!err)
721	sb_start_ro_state_change(sb);
722	for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
723	if (test_write_hold(m))
724	clear_write_hold(m);
725	}
726
727	return err;
728	}
729
730	static void free_vfsmnt(struct mount *mnt)
731	{
732	mnt_idmap_put(idmap: mnt_idmap(mnt: &mnt->mnt));
733	kfree_const(x: mnt->mnt_devname);
734	#ifdef CONFIG_SMP
735	free_percpu(pdata: mnt->mnt_pcp);
736	#endif
737	kmem_cache_free(s: mnt_cache, objp: mnt);
738	}
739
740	static void delayed_free_vfsmnt(struct rcu_head *head)
741	{
742	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
743	}
744
745	/ call under rcu_read_lock /
746	int __legitimize_mnt(struct vfsmount bastard, unsigned* seq)
747	{
748	struct mount *mnt;
749	if (read_seqretry(sl: &mount_lock, start: seq))
750	return `1`;
751	if (bastard == NULL)
752	return `0`;
753	mnt = real_mount(mnt: bastard);
754	mnt_add_count(mnt, n: `1`);
755	smp_mb(); // see mntput_no_expire() and do_umount()
756	if (likely(!read_seqretry(&mount_lock, seq)))
757	return `0`;
758	lock_mount_hash();
759	if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT \| MNT_DOOMED))) {
760	mnt_add_count(mnt, n: -`1`);
761	unlock_mount_hash();
762	return `1`;
763	}
764	unlock_mount_hash();
765	/ caller will mntput() /
766	return -`1`;
767	}
768
769	/ call under rcu_read_lock /
770	static bool legitimize_mnt(struct vfsmount bastard, unsigned* seq)
771	{
772	int res = __legitimize_mnt(bastard, seq);
773	if (likely(!res))
774	return true;
775	if (unlikely(res < `0`)) {
776	rcu_read_unlock();
777	mntput(mnt: bastard);
778	rcu_read_lock();
779	}
780	return false;
781	}
782
783	/**
784	* __lookup_mnt - mount hash lookup
785	* @mnt: parent mount
786	* @dentry: dentry of mountpoint
787	*
788	* If @mnt has a child mount @c mounted on @dentry find and return it.
789	* Caller must either hold the spinlock component of @mount_lock or
790	* hold rcu_read_lock(), sample the seqcount component before the call
791	* and recheck it afterwards.
792	*
793	* Return: The child of @mnt mounted on @dentry or %NULL.
794	*/
795	struct mount __lookup_mnt(struct* vfsmount mnt, struct* dentry *dentry)
796	{
797	struct hlist_head *head = m_hash(mnt, dentry);
798	struct mount *p;
799
800	hlist_for_each_entry_rcu(p, head, mnt_hash)
801	if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
802	return p;
803	return NULL;
804	}
805
806	/**
807	* lookup_mnt - Return the child mount mounted at given location
808	* @path: location in the namespace
809	*
810	* Acquires and returns a new reference to mount at given location
811	* or %NULL if nothing is mounted there.
812	*/
813	struct vfsmount lookup_mnt(const* struct path *path)
814	{
815	struct mount *child_mnt;
816	struct vfsmount *m;
817	unsigned seq;
818
819	rcu_read_lock();
820	do {
821	seq = read_seqbegin(sl: &mount_lock);
822	child_mnt = __lookup_mnt(mnt: path->mnt, dentry: path->dentry);
823	m = child_mnt ? &child_mnt->mnt : NULL;
824	} while (!legitimize_mnt(bastard: m, seq));
825	rcu_read_unlock();
826	return m;
827	}
828
829	/*
830	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
831	* current mount namespace.
832	*
833	* The common case is dentries are not mountpoints at all and that
834	* test is handled inline. For the slow case when we are actually
835	* dealing with a mountpoint of some kind, walk through all of the
836	* mounts in the current mount namespace and test to see if the dentry
837	* is a mountpoint.
838	*
839	* The mount_hashtable is not usable in the context because we
840	* need to identify all mounts that may be in the current mount
841	* namespace not just a mount that happens to have some specified
842	* parent mount.
843	*/
844	bool __is_local_mountpoint(const struct dentry *dentry)
845	{
846	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
847	struct mount mnt, n;
848
849	guard(namespace_shared)();
850
851	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node)
852	if (mnt->mnt_mountpoint == dentry)
853	return true;
854
855	return false;
856	}
857
858	struct pinned_mountpoint {
859	struct hlist_node node;
860	struct mountpoint *mp;
861	struct mount *parent;
862	};
863
864	static bool lookup_mountpoint(struct dentry dentry, struct* pinned_mountpoint *m)
865	{
866	struct hlist_head *chain = mp_hash(dentry);
867	struct mountpoint *mp;
868
869	hlist_for_each_entry(mp, chain, m_hash) {
870	if (mp->m_dentry == dentry) {
871	hlist_add_head(n: &m->node, h: &mp->m_list);
872	m->mp = mp;
873	return true;
874	}
875	}
876	return false;
877	}
878
879	static int get_mountpoint(struct dentry dentry, struct* pinned_mountpoint *m)
880	{
881	struct mountpoint *mp __free(kfree) = NULL;
882	bool found;
883	int ret;
884
885	if (d_mountpoint(dentry)) {
886	/ might be worth a WARN_ON() /
887	if (d_unlinked(dentry))
888	return -ENOENT;
889	mountpoint:
890	read_seqlock_excl(sl: &mount_lock);
891	found = lookup_mountpoint(dentry, m);
892	read_sequnlock_excl(sl: &mount_lock);
893	if (found)
894	return `0`;
895	}
896
897	if (!mp)
898	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
899	if (!mp)
900	return -ENOMEM;
901
902	/ Exactly one processes may set d_mounted /
903	ret = d_set_mounted(dentry);
904
905	/ Someone else set d_mounted? /
906	if (ret == -EBUSY)
907	goto mountpoint;
908
909	/ The dentry is not available as a mountpoint? /
910	if (ret)
911	return ret;
912
913	/ Add the new mountpoint to the hash table /
914	read_seqlock_excl(sl: &mount_lock);
915	mp->m_dentry = dget(dentry);
916	hlist_add_head(n: &mp->m_hash, h: mp_hash(dentry));
917	INIT_HLIST_HEAD(&mp->m_list);
918	hlist_add_head(n: &m->node, h: &mp->m_list);
919	m->mp = no_free_ptr(mp);
920	read_sequnlock_excl(sl: &mount_lock);
921	return `0`;
922	}
923
924	/*
925	* vfsmount lock must be held. Additionally, the caller is responsible
926	* for serializing calls for given disposal list.
927	*/
928	static void maybe_free_mountpoint(struct mountpoint mp, struct* list_head *list)
929	{
930	if (hlist_empty(h: &mp->m_list)) {
931	struct dentry *dentry = mp->m_dentry;
932	spin_lock(lock: &dentry->d_lock);
933	dentry->d_flags &= ~DCACHE_MOUNTED;
934	spin_unlock(lock: &dentry->d_lock);
935	dput_to_list(dentry, list);
936	hlist_del(n: &mp->m_hash);
937	kfree(objp: mp);
938	}
939	}
940
941	/*
942	* locks: mount_lock [read_seqlock_excl], namespace_sem [excl]
943	*/
944	static void unpin_mountpoint(struct pinned_mountpoint *m)
945	{
946	if (m->mp) {
947	hlist_del(n: &m->node);
948	maybe_free_mountpoint(mp: m->mp, list: &ex_mountpoints);
949	}
950	}
951
952	static inline int check_mnt(const struct mount *mnt)
953	{
954	return mnt->mnt_ns == current->nsproxy->mnt_ns;
955	}
956
957	static inline bool check_anonymous_mnt(struct mount *mnt)
958	{
959	u64 seq;
960
961	if (!is_anon_ns(ns: mnt->mnt_ns))
962	return false;
963
964	seq = mnt->mnt_ns->seq_origin;
965	return !seq \|\| (seq == current->nsproxy->mnt_ns->ns.ns_id);
966	}
967
968	/*
969	* vfsmount lock must be held for write
970	*/
971	static void touch_mnt_namespace(struct mnt_namespace *ns)
972	{
973	if (ns) {
974	ns->event = ++event;
975	wake_up_interruptible(&ns->poll);
976	}
977	}
978
979	/*
980	* vfsmount lock must be held for write
981	*/
982	static void __touch_mnt_namespace(struct mnt_namespace *ns)
983	{
984	if (ns && ns->event != event) {
985	ns->event = event;
986	wake_up_interruptible(&ns->poll);
987	}
988	}
989
990	/*
991	* locks: mount_lock[write_seqlock]
992	*/
993	static void __umount_mnt(struct mount mnt, struct* list_head *shrink_list)
994	{
995	struct mountpoint *mp;
996	struct mount *parent = mnt->mnt_parent;
997	if (unlikely(parent->overmount == mnt))
998	parent->overmount = NULL;
999	mnt->mnt_parent = mnt;
1000	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1001	list_del_init(entry: &mnt->mnt_child);
1002	hlist_del_init_rcu(n: &mnt->mnt_hash);
1003	hlist_del_init(n: &mnt->mnt_mp_list);
1004	mp = mnt->mnt_mp;
1005	mnt->mnt_mp = NULL;
1006	maybe_free_mountpoint(mp, list: shrink_list);
1007	}
1008
1009	/*
1010	* locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)
1011	*/
1012	static void umount_mnt(struct mount *mnt)
1013	{
1014	__umount_mnt(mnt, shrink_list: &ex_mountpoints);
1015	}
1016
1017	/*
1018	* vfsmount lock must be held for write
1019	*/
1020	void mnt_set_mountpoint(struct mount *mnt,
1021	struct mountpoint *mp,
1022	struct mount *child_mnt)
1023	{
1024	child_mnt->mnt_mountpoint = mp->m_dentry;
1025	child_mnt->mnt_parent = mnt;
1026	child_mnt->mnt_mp = mp;
1027	hlist_add_head(n: &child_mnt->mnt_mp_list, h: &mp->m_list);
1028	}
1029
1030	static void make_visible(struct mount *mnt)
1031	{
1032	struct mount *parent = mnt->mnt_parent;
1033	if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
1034	parent->overmount = mnt;
1035	hlist_add_head_rcu(n: &mnt->mnt_hash,
1036	h: m_hash(mnt: &parent->mnt, dentry: mnt->mnt_mountpoint));
1037	list_add_tail(new: &mnt->mnt_child, head: &parent->mnt_mounts);
1038	}
1039
/**
 * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
 *              list of child mounts
 * @mnt:    the new mount
 * @parent: the parent
 * @mp:     the new mountpoint
 *
 * Mount @mnt at @mp on @parent. Then attach @mnt
 * to @parent's child mount list and to @mount_hashtable.
 *
 * Note, when make_visible() is called @mnt->mnt_parent already points
 * to the correct parent.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
		       struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	make_visible(mnt);
}
1062
1063	void mnt_change_mountpoint(struct mount parent, struct* mountpoint mp, struct* mount *mnt)
1064	{
1065	struct mountpoint *old_mp = mnt->mnt_mp;
1066
1067	list_del_init(entry: &mnt->mnt_child);
1068	hlist_del_init(n: &mnt->mnt_mp_list);
1069	hlist_del_init_rcu(n: &mnt->mnt_hash);
1070
1071	attach_mnt(mnt, parent, mp);
1072
1073	maybe_free_mountpoint(mp: old_mp, list: &ex_mountpoints);
1074	}
1075
1076	static inline struct mount node_to_mount(struct* rb_node *node)
1077	{
1078	return node ? rb_entry(node, struct mount, mnt_node) : NULL;
1079	}
1080
1081	static void mnt_add_to_ns(struct mnt_namespace ns, struct* mount *mnt)
1082	{
1083	struct rb_node **link = &ns->mounts.rb_node;
1084	struct rb_node *parent = NULL;
1085	bool mnt_first_node = true, mnt_last_node = true;
1086
1087	WARN_ON(mnt_ns_attached(mnt));
1088	mnt->mnt_ns = ns;
1089	while (*link) {
1090	parent = *link;
1091	if (mnt->mnt_id_unique < node_to_mount(node: parent)->mnt_id_unique) {
1092	link = &parent->rb_left;
1093	mnt_last_node = false;
1094	} else {
1095	link = &parent->rb_right;
1096	mnt_first_node = false;
1097	}
1098	}
1099
1100	if (mnt_last_node)
1101	ns->mnt_last_node = &mnt->mnt_node;
1102	if (mnt_first_node)
1103	ns->mnt_first_node = &mnt->mnt_node;
1104	rb_link_node(node: &mnt->mnt_node, parent, rb_link: link);
1105	rb_insert_color(&mnt->mnt_node, &ns->mounts);
1106
1107	mnt_notify_add(m: mnt);
1108	}
1109
1110	static struct mount next_mnt(struct* mount p, struct* mount *root)
1111	{
1112	struct list_head *next = p->mnt_mounts.next;
1113	if (next == &p->mnt_mounts) {
1114	while (`1`) {
1115	if (p == root)
1116	return NULL;
1117	next = p->mnt_child.next;
1118	if (next != &p->mnt_parent->mnt_mounts)
1119	break;
1120	p = p->mnt_parent;
1121	}
1122	}
1123	return list_entry(next, struct mount, mnt_child);
1124	}
1125
1126	static struct mount skip_mnt_tree(struct* mount *p)
1127	{
1128	struct list_head *prev = p->mnt_mounts.prev;
1129	while (prev != &p->mnt_mounts) {
1130	p = list_entry(prev, struct mount, mnt_child);
1131	prev = p->mnt_mounts.prev;
1132	}
1133	return p;
1134	}
1135
1136	/*
1137	* vfsmount lock must be held for write
1138	*/
1139	static void commit_tree(struct mount *mnt)
1140	{
1141	struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;
1142
1143	if (!mnt_ns_attached(mnt)) {
1144	for (struct mount *m = mnt; m; m = next_mnt(p: m, root: mnt))
1145	mnt_add_to_ns(ns: n, mnt: m);
1146	n->nr_mounts += n->pending_mounts;
1147	n->pending_mounts = `0`;
1148	}
1149
1150	make_visible(mnt);
1151	touch_mnt_namespace(ns: n);
1152	}
1153
1154	static void setup_mnt(struct mount m, struct* dentry *root)
1155	{
1156	struct super_block *s = root->d_sb;
1157
1158	atomic_inc(v: &s->s_active);
1159	m->mnt.mnt_sb = s;
1160	m->mnt.mnt_root = dget(dentry: root);
1161	m->mnt_mountpoint = m->mnt.mnt_root;
1162	m->mnt_parent = m;
1163
1164	guard(mount_locked_reader)();
1165	mnt_add_instance(m, s);
1166	}
1167
1168	/**
1169	* vfs_create_mount - Create a mount for a configured superblock
1170	* @fc: The configuration context with the superblock attached
1171	*
1172	* Create a mount to an already configured superblock. If necessary, the
1173	* caller should invoke vfs_get_tree() before calling this.
1174	*
1175	* Note that this does not attach the mount to anything.
1176	*/
1177	struct vfsmount vfs_create_mount(struct* fs_context *fc)
1178	{
1179	struct mount *mnt;
1180
1181	if (!fc->root)
1182	return ERR_PTR(error: -EINVAL);
1183
1184	mnt = alloc_vfsmnt(name: fc->source);
1185	if (!mnt)
1186	return ERR_PTR(error: -ENOMEM);
1187
1188	if (fc->sb_flags & SB_KERNMOUNT)
1189	mnt->mnt.mnt_flags = MNT_INTERNAL;
1190
1191	setup_mnt(m: mnt, root: fc->root);
1192
1193	return &mnt->mnt;
1194	}
1195	EXPORT_SYMBOL(vfs_create_mount);
1196
1197	struct vfsmount fc_mount(struct* fs_context *fc)
1198	{
1199	int err = vfs_get_tree(fc);
1200	if (!err) {
1201	up_write(sem: &fc->root->d_sb->s_umount);
1202	return vfs_create_mount(fc);
1203	}
1204	return ERR_PTR(error: err);
1205	}
1206	EXPORT_SYMBOL(fc_mount);
1207
1208	struct vfsmount fc_mount_longterm(struct* fs_context *fc)
1209	{
1210	struct vfsmount *mnt = fc_mount(fc);
1211	if (!IS_ERR(ptr: mnt))
1212	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
1213	return mnt;
1214	}
1215	EXPORT_SYMBOL(fc_mount_longterm);
1216
1217	struct vfsmount vfs_kern_mount(struct* file_system_type *type,
1218	int flags, const char *name,
1219	void *data)
1220	{
1221	struct fs_context *fc;
1222	struct vfsmount *mnt;
1223	int ret = `0`;
1224
1225	if (!type)
1226	return ERR_PTR(error: -EINVAL);
1227
1228	fc = fs_context_for_mount(fs_type: type, sb_flags: flags);
1229	if (IS_ERR(ptr: fc))
1230	return ERR_CAST(ptr: fc);
1231
1232	if (name)
1233	ret = vfs_parse_fs_string(fc, key: "source", value: name);
1234	if (!ret)
1235	ret = parse_monolithic_mount_data(fc, data);
1236	if (!ret)
1237	mnt = fc_mount(fc);
1238	else
1239	mnt = ERR_PTR(error: ret);
1240
1241	put_fs_context(fc);
1242	return mnt;
1243	}
1244	EXPORT_SYMBOL_GPL(vfs_kern_mount);
1245
1246	static struct mount clone_mnt(struct* mount old, struct* dentry *root,
1247	int flag)
1248	{
1249	struct mount *mnt;
1250	int err;
1251
1252	mnt = alloc_vfsmnt(name: old->mnt_devname);
1253	if (!mnt)
1254	return ERR_PTR(error: -ENOMEM);
1255
1256	mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
1257	~MNT_INTERNAL_FLAGS;
1258
1259	if (flag & (CL_SLAVE \| CL_PRIVATE))
1260	mnt->mnt_group_id = `0`; / not a peer of original /
1261	else
1262	mnt->mnt_group_id = old->mnt_group_id;
1263
1264	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
1265	err = mnt_alloc_group_id(mnt);
1266	if (err)
1267	goto out_free;
1268	}
1269
1270	if (mnt->mnt_group_id)
1271	set_mnt_shared(mnt);
1272
1273	mnt->mnt.mnt_idmap = mnt_idmap_get(idmap: mnt_idmap(mnt: &old->mnt));
1274
1275	setup_mnt(m: mnt, root);
1276
1277	if (flag & CL_PRIVATE) // we are done with it
1278	return mnt;
1279
1280	if (peers(m1: mnt, m2: old))
1281	list_add(new: &mnt->mnt_share, head: &old->mnt_share);
1282
1283	if ((flag & CL_SLAVE) && old->mnt_group_id) {
1284	hlist_add_head(n: &mnt->mnt_slave, h: &old->mnt_slave_list);
1285	mnt->mnt_master = old;
1286	} else if (IS_MNT_SLAVE(old)) {
1287	hlist_add_behind(n: &mnt->mnt_slave, prev: &old->mnt_slave);
1288	mnt->mnt_master = old->mnt_master;
1289	}
1290	return mnt;
1291
1292	out_free:
1293	mnt_free_id(mnt);
1294	free_vfsmnt(mnt);
1295	return ERR_PTR(error: err);
1296	}
1297
1298	static void cleanup_mnt(struct mount *mnt)
1299	{
1300	struct hlist_node *p;
1301	struct mount *m;
1302	/*
1303	* The warning here probably indicates that somebody messed
1304	* up a mnt_want/drop_write() pair. If this happens, the
1305	* filesystem was probably unable to make r/w->r/o transitions.
1306	* The locking used to deal with mnt_count decrement provides barriers,
1307	* so mnt_get_writers() below is safe.
1308	*/
1309	WARN_ON(mnt_get_writers(mnt));
1310	if (unlikely(mnt->mnt_pins.first))
1311	mnt_pin_kill(m: mnt);
1312	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
1313	hlist_del(n: &m->mnt_umount);
1314	mntput(mnt: &m->mnt);
1315	}
1316	fsnotify_vfsmount_delete(mnt: &mnt->mnt);
1317	dput(mnt->mnt.mnt_root);
1318	deactivate_super(sb: mnt->mnt.mnt_sb);
1319	mnt_free_id(mnt);
1320	call_rcu(head: &mnt->mnt_rcu, func: delayed_free_vfsmnt);
1321	}
1322
1323	static void __cleanup_mnt(struct rcu_head *head)
1324	{
1325	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1326	}
1327
1328	static LLIST_HEAD(delayed_mntput_list);
1329	static void delayed_mntput(struct work_struct *unused)
1330	{
1331	struct llist_node *node = llist_del_all(head: &delayed_mntput_list);
1332	struct mount m, t;
1333
1334	llist_for_each_entry_safe(m, t, node, mnt_llist)
1335	cleanup_mnt(mnt: m);
1336	}
1337	static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1338
1339	static void noinline mntput_no_expire_slowpath(struct mount *mnt)
1340	{
1341	LIST_HEAD(list);
1342	int count;
1343
1344	VFS_BUG_ON(mnt->mnt_ns);
1345	lock_mount_hash();
1346	/*
1347	* make sure that if __legitimize_mnt() has not seen us grab
1348	* mount_lock, we'll see their refcount increment here.
1349	*/
1350	smp_mb();
1351	mnt_add_count(mnt, n: -`1`);
1352	count = mnt_get_count(mnt);
1353	if (count != `0`) {
1354	WARN_ON(count < `0`);
1355	rcu_read_unlock();
1356	unlock_mount_hash();
1357	return;
1358	}
1359	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1360	rcu_read_unlock();
1361	unlock_mount_hash();
1362	return;
1363	}
1364	mnt->mnt.mnt_flags \|= MNT_DOOMED;
1365	rcu_read_unlock();
1366
1367	mnt_del_instance(m: mnt);
1368	if (unlikely(!list_empty(&mnt->mnt_expire)))
1369	list_del(entry: &mnt->mnt_expire);
1370
1371	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1372	struct mount p, tmp;
1373	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1374	__umount_mnt(mnt: p, shrink_list: &list);
1375	hlist_add_head(n: &p->mnt_umount, h: &mnt->mnt_stuck_children);
1376	}
1377	}
1378	unlock_mount_hash();
1379	shrink_dentry_list(&list);
1380
1381	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1382	struct task_struct *task = current;
1383	if (likely(!(task->flags & PF_KTHREAD))) {
1384	init_task_work(twork: &mnt->mnt_rcu, func: __cleanup_mnt);
1385	if (!task_work_add(task, twork: &mnt->mnt_rcu, mode: TWA_RESUME))
1386	return;
1387	}
1388	if (llist_add(new: &mnt->mnt_llist, head: &delayed_mntput_list))
1389	schedule_delayed_work(dwork: &delayed_mntput_work, delay: `1`);
1390	return;
1391	}
1392	cleanup_mnt(mnt);
1393	}
1394
1395	static void mntput_no_expire(struct mount *mnt)
1396	{
1397	rcu_read_lock();
1398	if (likely(READ_ONCE(mnt->mnt_ns))) {
1399	/*
1400	* Since we don't do lock_mount_hash() here,
1401	* ->mnt_ns can change under us. However, if it's
1402	* non-NULL, then there's a reference that won't
1403	* be dropped until after an RCU delay done after
1404	* turning ->mnt_ns NULL. So if we observe it
1405	* non-NULL under rcu_read_lock(), the reference
1406	* we are dropping is not the final one.
1407	*/
1408	mnt_add_count(mnt, n: -`1`);
1409	rcu_read_unlock();
1410	return;
1411	}
1412	mntput_no_expire_slowpath(mnt);
1413	}
1414
1415	void mntput(struct vfsmount *mnt)
1416	{
1417	if (mnt) {
1418	struct mount *m = real_mount(mnt);
1419	/ avoid cacheline pingpong /
1420	if (unlikely(m->mnt_expiry_mark))
1421	WRITE_ONCE(m->mnt_expiry_mark, `0`);
1422	mntput_no_expire(mnt: m);
1423	}
1424	}
1425	EXPORT_SYMBOL(mntput);
1426
/* Take a reference to @mnt (NULL is a no-op) and return it. */
struct vfsmount *mntget(struct vfsmount *mnt)
{
	if (mnt)
		mnt_add_count(real_mount(mnt), 1);
	return mnt;
}
EXPORT_SYMBOL(mntget);
1434
1435	/*
1436	* Make a mount point inaccessible to new lookups.
1437	* Because there may still be current users, the caller MUST WAIT
1438	* for an RCU grace period before destroying the mount point.
1439	*/
1440	void mnt_make_shortterm(struct vfsmount *mnt)
1441	{
1442	if (mnt)
1443	real_mount(mnt)->mnt_ns = NULL;
1444	}
1445
1446	/**
1447	* path_is_mountpoint() - Check if path is a mount in the current namespace.
1448	* @path: path to check
1449	*
1450	* d_mountpoint() can only be used reliably to establish if a dentry is
1451	* not mounted in any namespace and that common case is handled inline.
1452	* d_mountpoint() isn't aware of the possibility there may be multiple
1453	* mounts using a given dentry in a different namespace. This function
1454	* checks if the passed in path is a mountpoint rather than the dentry
1455	* alone.
1456	*/
1457	bool path_is_mountpoint(const struct path *path)
1458	{
1459	unsigned seq;
1460	bool res;
1461
1462	if (!d_mountpoint(dentry: path->dentry))
1463	return false;
1464
1465	rcu_read_lock();
1466	do {
1467	seq = read_seqbegin(sl: &mount_lock);
1468	res = __path_is_mountpoint(path);
1469	} while (read_seqretry(sl: &mount_lock, start: seq));
1470	rcu_read_unlock();
1471
1472	return res;
1473	}
1474	EXPORT_SYMBOL(path_is_mountpoint);
1475
1476	struct vfsmount mnt_clone_internal(const* struct path *path)
1477	{
1478	struct mount *p;
1479	p = clone_mnt(old: real_mount(mnt: path->mnt), root: path->dentry, CL_PRIVATE);
1480	if (IS_ERR(ptr: p))
1481	return ERR_CAST(ptr: p);
1482	p->mnt.mnt_flags \|= MNT_INTERNAL;
1483	return &p->mnt;
1484	}
1485
1486	/*
1487	* Returns the mount which either has the specified mnt_id, or has the next
1488	* smallest id afer the specified one.
1489	*/
1490	static struct mount mnt_find_id_at(struct* mnt_namespace *ns, u64 mnt_id)
1491	{
1492	struct rb_node *node = ns->mounts.rb_node;
1493	struct mount *ret = NULL;
1494
1495	while (node) {
1496	struct mount *m = node_to_mount(node);
1497
1498	if (mnt_id <= m->mnt_id_unique) {
1499	ret = node_to_mount(node);
1500	if (mnt_id == m->mnt_id_unique)
1501	break;
1502	node = node->rb_left;
1503	} else {
1504	node = node->rb_right;
1505	}
1506	}
1507	return ret;
1508	}
1509
1510	/*
1511	* Returns the mount which either has the specified mnt_id, or has the next
1512	* greater id before the specified one.
1513	*/
1514	static struct mount mnt_find_id_at_reverse(struct* mnt_namespace *ns, u64 mnt_id)
1515	{
1516	struct rb_node *node = ns->mounts.rb_node;
1517	struct mount *ret = NULL;
1518
1519	while (node) {
1520	struct mount *m = node_to_mount(node);
1521
1522	if (mnt_id >= m->mnt_id_unique) {
1523	ret = node_to_mount(node);
1524	if (mnt_id == m->mnt_id_unique)
1525	break;
1526	node = node->rb_right;
1527	} else {
1528	node = node->rb_left;
1529	}
1530	}
1531	return ret;
1532	}
1533
1534	#ifdef CONFIG_PROC_FS
1535
1536	/ iterator; we want it to have access to namespace_sem, thus here... /
1537	static void m_start(struct* seq_file m, loff_t pos)
1538	{
1539	struct proc_mounts *p = m->private;
1540
1541	down_read(sem: &namespace_sem);
1542
1543	return mnt_find_id_at(ns: p->ns, mnt_id: *pos);
1544	}
1545
1546	static void m_next(struct* seq_file m, void* v, loff_t pos)
1547	{
1548	struct mount next = NULL, mnt = v;
1549	struct rb_node *node = rb_next(&mnt->mnt_node);
1550
1551	++*pos;
1552	if (node) {
1553	next = node_to_mount(node);
1554	*pos = next->mnt_id_unique;
1555	}
1556	return next;
1557	}
1558
1559	static void m_stop(struct seq_file m, void* *v)
1560	{
1561	up_read(sem: &namespace_sem);
1562	}
1563
1564	static int m_show(struct seq_file m, void* *v)
1565	{
1566	struct proc_mounts *p = m->private;
1567	struct mount *r = v;
1568	return p->show(m, &r->mnt);
1569	}
1570
1571	const struct seq_operations mounts_op = {
1572	.start = m_start,
1573	.next = m_next,
1574	.stop = m_stop,
1575	.show = m_show,
1576	};
1577
1578	#endif /* CONFIG_PROC_FS */
1579
1580	/**
1581	* may_umount_tree - check if a mount tree is busy
1582	* @m: root of mount tree
1583	*
1584	* This is called to check if a tree of mounts has any
1585	* open files, pwds, chroots or sub mounts that are
1586	* busy.
1587	*/
1588	int may_umount_tree(struct vfsmount *m)
1589	{
1590	struct mount *mnt = real_mount(mnt: m);
1591	bool busy = false;
1592
1593	/ write lock needed for mnt_get_count /
1594	lock_mount_hash();
1595	for (struct mount *p = mnt; p; p = next_mnt(p, root: mnt)) {
1596	if (mnt_get_count(mnt: p) > (p == mnt ? `2` : `1`)) {
1597	busy = true;
1598	break;
1599	}
1600	}
1601	unlock_mount_hash();
1602
1603	return !busy;
1604	}
1605
1606	EXPORT_SYMBOL(may_umount_tree);
1607
1608	/**
1609	* may_umount - check if a mount point is busy
1610	* @mnt: root of mount
1611	*
1612	* This is called to check if a mount point has any
1613	* open files, pwds, chroots or sub mounts. If the
1614	* mount has sub mounts this will return busy
1615	* regardless of whether the sub mounts are busy.
1616	*
1617	* Doesn't take quota and stuff into account. IOW, in some cases it will
1618	* give false negatives. The main reason why it's here is that we need
1619	* a non-destructive way to look for easily umountable filesystems.
1620	*/
1621	int may_umount(struct vfsmount *mnt)
1622	{
1623	int ret = `1`;
1624	down_read(sem: &namespace_sem);
1625	lock_mount_hash();
1626	if (propagate_mount_busy(real_mount(mnt), `2`))
1627	ret = `0`;
1628	unlock_mount_hash();
1629	up_read(sem: &namespace_sem);
1630	return ret;
1631	}
1632
1633	EXPORT_SYMBOL(may_umount);
1634
1635	#ifdef CONFIG_FSNOTIFY
1636	static void mnt_notify(struct mount *p)
1637	{
1638	if (!p->prev_ns && p->mnt_ns) {
1639	fsnotify_mnt_attach(ns: p->mnt_ns, mnt: &p->mnt);
1640	} else if (p->prev_ns && !p->mnt_ns) {
1641	fsnotify_mnt_detach(ns: p->prev_ns, mnt: &p->mnt);
1642	} else if (p->prev_ns == p->mnt_ns) {
1643	fsnotify_mnt_move(ns: p->mnt_ns, mnt: &p->mnt);
1644	} else {
1645	fsnotify_mnt_detach(ns: p->prev_ns, mnt: &p->mnt);
1646	fsnotify_mnt_attach(ns: p->mnt_ns, mnt: &p->mnt);
1647	}
1648	p->prev_ns = p->mnt_ns;
1649	}
1650
1651	static void notify_mnt_list(void)
1652	{
1653	struct mount m, tmp;
1654	/*
1655	* Notify about mounts that were added/reparented/detached/remain
1656	* connected after unmount.
1657	*/
1658	list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
1659	mnt_notify(p: m);
1660	list_del_init(entry: &m->to_notify);
1661	}
1662	}
1663
1664	static bool need_notify_mnt_list(void)
1665	{
1666	return !list_empty(head: &notify_list);
1667	}
1668	#else
/* !CONFIG_FSNOTIFY stub: nothing to notify. */
static void notify_mnt_list(void)
{
}
1672
1673	static bool need_notify_mnt_list(void)
1674	{
1675	return false;
1676	}
1677	#endif
1678
1679	static void free_mnt_ns(struct mnt_namespace *);
1680	static void namespace_unlock(void)
1681	{
1682	struct hlist_head head;
1683	struct hlist_node *p;
1684	struct mount *m;
1685	struct mnt_namespace *ns = emptied_ns;
1686	LIST_HEAD(list);
1687
1688	hlist_move_list(old: &unmounted, new: &head);
1689	list_splice_init(list: &ex_mountpoints, head: &list);
1690	emptied_ns = NULL;
1691
1692	if (need_notify_mnt_list()) {
1693	/*
1694	* No point blocking out concurrent readers while notifications
1695	* are sent. This will also allow statmount()/listmount() to run
1696	* concurrently.
1697	*/
1698	downgrade_write(sem: &namespace_sem);
1699	notify_mnt_list();
1700	up_read(sem: &namespace_sem);
1701	} else {
1702	up_write(sem: &namespace_sem);
1703	}
1704	if (unlikely(ns)) {
1705	/ Make sure we notice when we leak mounts. /
1706	VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
1707	free_mnt_ns(ns);
1708	}
1709
1710	shrink_dentry_list(&list);
1711
1712	if (likely(hlist_empty(&head)))
1713	return;
1714
1715	synchronize_rcu_expedited();
1716
1717	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
1718	hlist_del(n: &m->mnt_umount);
1719	mntput(&m->mnt);
1720	}
1721	}
1722
1723	static inline void namespace_lock(void)
1724	{
1725	down_write(sem: &namespace_sem);
1726	}
1727
/* Bit flags for the @how argument of umount_tree() and disconnect_mount(). */
enum umount_tree_flags {
	UMOUNT_SYNC = 1,
	UMOUNT_PROPAGATE = 2,
	UMOUNT_CONNECTED = 4,
};
1733
1734	static bool disconnect_mount(struct mount mnt, enum* umount_tree_flags how)
1735	{
1736	/ Leaving mounts connected is only valid for lazy umounts /
1737	if (how & UMOUNT_SYNC)
1738	return true;
1739
1740	/ A mount without a parent has nothing to be connected to /
1741	if (!mnt_has_parent(mnt))
1742	return true;
1743
1744	/ Because the reference counting rules change when mounts are*
1745	* unmounted and connected, umounted mounts may not be
1746	* connected to mounted mounts.
1747	*/
1748	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1749	return true;
1750
1751	/ Has it been requested that the mount remain connected? /
1752	if (how & UMOUNT_CONNECTED)
1753	return false;
1754
1755	/ Is the mount locked such that it needs to remain connected? /
1756	if (IS_MNT_LOCKED(mnt))
1757	return false;
1758
1759	/ By default disconnect the mount /
1760	return true;
1761	}
1762
1763	/*
1764	* mount_lock must be held
1765	* namespace_sem must be held for write
1766	*/
1767	static void umount_tree(struct mount mnt, enum* umount_tree_flags how)
1768	{
1769	LIST_HEAD(tmp_list);
1770	struct mount *p;
1771
1772	if (how & UMOUNT_PROPAGATE)
1773	propagate_mount_unlock(mnt);
1774
1775	/ Gather the mounts to umount /
1776	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
1777	p->mnt.mnt_flags \|= MNT_UMOUNT;
1778	if (mnt_ns_attached(mnt: p))
1779	move_from_ns(mnt: p);
1780	list_add_tail(new: &p->mnt_list, head: &tmp_list);
1781	}
1782
1783	/ Hide the mounts from mnt_mounts /
1784	list_for_each_entry(p, &tmp_list, mnt_list) {
1785	list_del_init(entry: &p->mnt_child);
1786	}
1787
1788	/ Add propagated mounts to the tmp_list /
1789	if (how & UMOUNT_PROPAGATE)
1790	propagate_umount(&tmp_list);
1791
1792	bulk_make_private(&tmp_list);
1793
1794	while (!list_empty(head: &tmp_list)) {
1795	struct mnt_namespace *ns;
1796	bool disconnect;
1797	p = list_first_entry(&tmp_list, struct mount, mnt_list);
1798	list_del_init(entry: &p->mnt_expire);
1799	list_del_init(entry: &p->mnt_list);
1800	ns = p->mnt_ns;
1801	if (ns) {
1802	ns->nr_mounts--;
1803	__touch_mnt_namespace(ns);
1804	}
1805	p->mnt_ns = NULL;
1806	if (how & UMOUNT_SYNC)
1807	p->mnt.mnt_flags \|= MNT_SYNC_UMOUNT;
1808
1809	disconnect = disconnect_mount(mnt: p, how);
1810	if (mnt_has_parent(mnt: p)) {
1811	if (!disconnect) {
1812	/ Don't forget about p /
1813	list_add_tail(new: &p->mnt_child, head: &p->mnt_parent->mnt_mounts);
1814	} else {
1815	umount_mnt(mnt: p);
1816	}
1817	}
1818	if (disconnect)
1819	hlist_add_head(n: &p->mnt_umount, h: &unmounted);
1820
1821	/*
1822	* At this point p->mnt_ns is NULL, notification will be queued
1823	* only if
1824	*
1825	* - p->prev_ns is non-NULL and
1826	* - p->prev_ns->n_fsnotify_marks is non-NULL
1827	*
1828	* This will preclude queuing the mount if this is a cleanup
1829	* after a failed copy_tree() or destruction of an anonymous
1830	* namespace, etc.
1831	*/
1832	mnt_notify_add(m: p);
1833	}
1834	}
1835
1836	static void shrink_submounts(struct mount *mnt);
1837
1838	static int do_umount_root(struct super_block *sb)
1839	{
1840	int ret = `0`;
1841
1842	down_write(sem: &sb->s_umount);
1843	if (!sb_rdonly(sb)) {
1844	struct fs_context *fc;
1845
1846	fc = fs_context_for_reconfigure(dentry: sb->s_root, SB_RDONLY,
1847	SB_RDONLY);
1848	if (IS_ERR(ptr: fc)) {
1849	ret = PTR_ERR(ptr: fc);
1850	} else {
1851	ret = parse_monolithic_mount_data(fc, NULL);
1852	if (!ret)
1853	ret = reconfigure_super(fc);
1854	put_fs_context(fc);
1855	}
1856	}
1857	up_write(sem: &sb->s_umount);
1858	return ret;
1859	}
1860
1861	static int do_umount(struct mount mnt, int* flags)
1862	{
1863	struct super_block *sb = mnt->mnt.mnt_sb;
1864	int retval;
1865
1866	retval = security_sb_umount(mnt: &mnt->mnt, flags);
1867	if (retval)
1868	return retval;
1869
1870	/*
1871	* Allow userspace to request a mountpoint be expired rather than
1872	* unmounting unconditionally. Unmount only happens if:
1873	* (1) the mark is already set (the mark is cleared by mntput())
1874	* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1875	*/
1876	if (flags & MNT_EXPIRE) {
1877	if (&mnt->mnt == current->fs->root.mnt \|\|
1878	flags & (MNT_FORCE \| MNT_DETACH))
1879	return -EINVAL;
1880
1881	/*
1882	* probably don't strictly need the lock here if we examined
1883	* all race cases, but it's a slowpath.
1884	*/
1885	lock_mount_hash();
1886	if (!list_empty(head: &mnt->mnt_mounts) \|\| mnt_get_count(mnt) != `2`) {
1887	unlock_mount_hash();
1888	return -EBUSY;
1889	}
1890	unlock_mount_hash();
1891
1892	if (!xchg(&mnt->mnt_expiry_mark, `1`))
1893	return -EAGAIN;
1894	}
1895
1896	/*
1897	* If we may have to abort operations to get out of this
1898	* mount, and they will themselves hold resources we must
1899	* allow the fs to do things. In the Unix tradition of
1900	* 'Gee thats tricky lets do it in userspace' the umount_begin
1901	* might fail to complete on the first run through as other tasks
1902	* must return, and the like. Thats for the mount program to worry
1903	* about for the moment.
1904	*/
1905
1906	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1907	sb->s_op->umount_begin(sb);
1908	}
1909
1910	/*
1911	* No sense to grab the lock for this test, but test itself looks
1912	* somewhat bogus. Suggestions for better replacement?
1913	* Ho-hum... In principle, we might treat that as umount + switch
1914	* to rootfs. GC would eventually take care of the old vfsmount.
1915	* Actually it makes sense, especially if rootfs would contain a
1916	* /reboot - static binary that would close all descriptors and
1917	* call reboot(9). Then init(8) could umount root and exec /reboot.
1918	*/
1919	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1920	/*
1921	* Special case for "unmounting" root ...
1922	* we just try to remount it readonly.
1923	*/
1924	if (!ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN))
1925	return -EPERM;
1926	return do_umount_root(sb);
1927	}
1928
1929	namespace_lock();
1930	lock_mount_hash();
1931
1932	/ Repeat the earlier racy checks, now that we are holding the locks /
1933	retval = -EINVAL;
1934	if (!check_mnt(mnt))
1935	goto out;
1936
1937	if (mnt->mnt.mnt_flags & MNT_LOCKED)
1938	goto out;
1939
1940	if (!mnt_has_parent(mnt)) / not the absolute root /
1941	goto out;
1942
1943	event++;
1944	if (flags & MNT_DETACH) {
1945	umount_tree(mnt, how: UMOUNT_PROPAGATE);
1946	retval = `0`;
1947	} else {
1948	smp_mb(); // paired with __legitimize_mnt()
1949	shrink_submounts(mnt);
1950	retval = -EBUSY;
1951	if (!propagate_mount_busy(mnt, `2`)) {
1952	umount_tree(mnt, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
1953	retval = `0`;
1954	}
1955	}
1956	out:
1957	unlock_mount_hash();
1958	namespace_unlock();
1959	return retval;
1960	}
1961
1962	/*
1963	* __detach_mounts - lazily unmount all mounts on the specified dentry
1964	*
1965	* During unlink, rmdir, and d_drop it is possible to loose the path
1966	* to an existing mountpoint, and wind up leaking the mount.
1967	* detach_mounts allows lazily unmounting those mounts instead of
1968	* leaking them.
1969	*
1970	* The caller may hold dentry->d_inode->i_rwsem.
1971	*/
1972	void __detach_mounts(struct dentry *dentry)
1973	{
1974	struct pinned_mountpoint mp = {};
1975	struct mount *mnt;
1976
1977	guard(namespace_excl)();
1978	guard(mount_writer)();
1979
1980	if (!lookup_mountpoint(dentry, m: &mp))
1981	return;
1982
1983	event++;
1984	while (mp.node.next) {
1985	mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list);
1986	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1987	umount_mnt(mnt);
1988	hlist_add_head(n: &mnt->mnt_umount, h: &unmounted);
1989	}
1990	else umount_tree(mnt, how: UMOUNT_CONNECTED);
1991	}
1992	unpin_mountpoint(m: &mp);
1993	}
1994
1995	/*
1996	* Is the caller allowed to modify his namespace?
1997	*/
1998	bool may_mount(void)
1999	{
2000	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
2001	}
2002
/* Print, once, a warning that the "mand" mount option is deprecated. */
static void warn_mandlock(void)
{
	pr_warn_once("=======================================================\n"
		     "WARNING: The mand mount option has been deprecated and\n"
		     " and is ignored by this kernel. Remove the mand\n"
		     " option from the mount to silence this warning.\n"
		     "=======================================================\n");
}
2011
2012	static int can_umount(const struct path path, int* flags)
2013	{
2014	struct mount *mnt = real_mount(mnt: path->mnt);
2015	struct super_block *sb = path->dentry->d_sb;
2016
2017	if (!may_mount())
2018	return -EPERM;
2019	if (!path_mounted(path))
2020	return -EINVAL;
2021	if (!check_mnt(mnt))
2022	return -EINVAL;
2023	if (mnt->mnt.mnt_flags & MNT_LOCKED) / Check optimistically /
2024	return -EINVAL;
2025	if (flags & MNT_FORCE && !ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN))
2026	return -EPERM;
2027	return `0`;
2028	}
2029
2030	// caller is responsible for flags being sane
2031	int path_umount(const struct path path, int* flags)
2032	{
2033	struct mount *mnt = real_mount(mnt: path->mnt);
2034	int ret;
2035
2036	ret = can_umount(path, flags);
2037	if (!ret)
2038	ret = do_umount(mnt, flags);
2039
2040	/ we mustn't call path_put() as that would clear mnt_expiry_mark /
2041	dput(path->dentry);
2042	mntput_no_expire(mnt);
2043	return ret;
2044	}
2045
2046	static int ksys_umount(char __user name, int* flags)
2047	{
2048	int lookup_flags = LOOKUP_MOUNTPOINT;
2049	struct path path;
2050	int ret;
2051
2052	// basic validity checks done first
2053	if (flags & ~(MNT_FORCE \| MNT_DETACH \| MNT_EXPIRE \| UMOUNT_NOFOLLOW))
2054	return -EINVAL;
2055
2056	if (!(flags & UMOUNT_NOFOLLOW))
2057	lookup_flags \|= LOOKUP_FOLLOW;
2058	ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
2059	if (ret)
2060	return ret;
2061	return path_umount(path: &path, flags);
2062	}
2063
2064	SYSCALL_DEFINE2(umount, char __user , name, int*, flags)
2065	{
2066	return ksys_umount(name, flags);
2067	}
2068
2069	#ifdef __ARCH_WANT_SYS_OLDUMOUNT
2070
2071	/*
2072	* The 2.0 compatible umount. No flags.
2073	*/
2074	SYSCALL_DEFINE1(oldumount, char __user *, name)
2075	{
2076	return ksys_umount(name, flags: `0`);
2077	}
2078
2079	#endif
2080
2081	static bool is_mnt_ns_file(struct dentry *dentry)
2082	{
2083	struct ns_common *ns;
2084
2085	/ Is this a proxy for a mount namespace? /
2086	if (dentry->d_op != &ns_dentry_operations)
2087	return false;
2088
2089	ns = d_inode(dentry)->i_private;
2090
2091	return ns->ops == &mntns_operations;
2092	}
2093
2094	struct ns_common from_mnt_ns(struct* mnt_namespace *mnt)
2095	{
2096	return &mnt->ns;
2097	}
2098
2099	struct mnt_namespace get_sequential_mnt_ns(struct* mnt_namespace *mntns, bool previous)
2100	{
2101	struct ns_common *ns;
2102
2103	guard(rcu)();
2104
2105	for (;;) {
2106	ns = ns_tree_adjoined_rcu(mntns, previous);
2107	if (IS_ERR(ptr: ns))
2108	return ERR_CAST(ptr: ns);
2109
2110	mntns = to_mnt_ns(ns);
2111
2112	/*
2113	* The last passive reference count is put with RCU
2114	* delay so accessing the mount namespace is not just
2115	* safe but all relevant members are still valid.
2116	*/
2117	if (!ns_capable_noaudit(ns: mntns->user_ns, CAP_SYS_ADMIN))
2118	continue;
2119
2120	/*
2121	* We need an active reference count as we're persisting
2122	* the mount namespace and it might already be on its
2123	* deathbed.
2124	*/
2125	if (!ns_ref_get(mntns))
2126	continue;
2127
2128	return mntns;
2129	}
2130	}
2131
2132	struct mnt_namespace mnt_ns_from_dentry(struct* dentry *dentry)
2133	{
2134	if (!is_mnt_ns_file(dentry))
2135	return NULL;
2136
2137	return to_mnt_ns(get_proc_ns(dentry->d_inode));
2138	}
2139
2140	static bool mnt_ns_loop(struct dentry *dentry)
2141	{
2142	/ Could bind mounting the mount namespace inode cause a*
2143	* mount namespace loop?
2144	*/
2145	struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
2146
2147	if (!mnt_ns)
2148	return false;
2149
2150	return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id;
2151	}
2152
2153	struct mount copy_tree(struct* mount src_root, struct* dentry *dentry,
2154	int flag)
2155	{
2156	struct mount res, src_parent, src_root_child, src_mnt,
2157	dst_parent, dst_mnt;
2158
2159	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
2160	return ERR_PTR(error: -EINVAL);
2161
2162	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
2163	return ERR_PTR(error: -EINVAL);
2164
2165	res = dst_mnt = clone_mnt(old: src_root, root: dentry, flag);
2166	if (IS_ERR(ptr: dst_mnt))
2167	return dst_mnt;
2168
2169	src_parent = src_root;
2170
2171	list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
2172	if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
2173	continue;
2174
2175	for (src_mnt = src_root_child; src_mnt;
2176	src_mnt = next_mnt(p: src_mnt, root: src_root_child)) {
2177	if (!(flag & CL_COPY_UNBINDABLE) &&
2178	IS_MNT_UNBINDABLE(src_mnt)) {
2179	if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
2180	/ Both unbindable and locked. /
2181	dst_mnt = ERR_PTR(error: -EPERM);
2182	goto out;
2183	} else {
2184	src_mnt = skip_mnt_tree(p: src_mnt);
2185	continue;
2186	}
2187	}
2188	if (!(flag & CL_COPY_MNT_NS_FILE) &&
2189	is_mnt_ns_file(dentry: src_mnt->mnt.mnt_root)) {
2190	src_mnt = skip_mnt_tree(p: src_mnt);
2191	continue;
2192	}
2193	while (src_parent != src_mnt->mnt_parent) {
2194	src_parent = src_parent->mnt_parent;
2195	dst_mnt = dst_mnt->mnt_parent;
2196	}
2197
2198	src_parent = src_mnt;
2199	dst_parent = dst_mnt;
2200	dst_mnt = clone_mnt(old: src_mnt, root: src_mnt->mnt.mnt_root, flag);
2201	if (IS_ERR(ptr: dst_mnt))
2202	goto out;
2203	lock_mount_hash();
2204	if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
2205	dst_mnt->mnt.mnt_flags \|= MNT_LOCKED;
2206	if (unlikely(flag & CL_EXPIRE)) {
2207	/ stick the duplicate mount on the same expiry*
2208	* list as the original if that was on one */
2209	if (!list_empty(head: &src_mnt->mnt_expire))
2210	list_add(new: &dst_mnt->mnt_expire,
2211	head: &src_mnt->mnt_expire);
2212	}
2213	attach_mnt(mnt: dst_mnt, parent: dst_parent, mp: src_parent->mnt_mp);
2214	unlock_mount_hash();
2215	}
2216	}
2217	return res;
2218
2219	out:
2220	if (res) {
2221	lock_mount_hash();
2222	umount_tree(mnt: res, how: UMOUNT_SYNC);
2223	unlock_mount_hash();
2224	}
2225	return dst_mnt;
2226	}
2227
2228	static inline bool extend_array(struct path res, struct path to_free,
2229	unsigned n, unsigned count, unsigned* new_count)
2230	{
2231	struct path *p;
2232
2233	if (likely(n < *count))
2234	return true;
2235	p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
2236	if (p && *count)
2237	memcpy(p, res, count * sizeof(struct path));
2238	*count = new_count;
2239	kfree(objp: *to_free);
2240	to_free = res = p;
2241	return p;
2242	}
2243
2244	const struct path collect_paths(const* struct path *path,
2245	struct path prealloc, unsigned* count)
2246	{
2247	struct mount *root = real_mount(mnt: path->mnt);
2248	struct mount *child;
2249	struct path res = prealloc, to_free = NULL;
2250	unsigned n = `0`;
2251
2252	guard(namespace_shared)();
2253
2254	if (!check_mnt(mnt: root))
2255	return ERR_PTR(error: -EINVAL);
2256	if (!extend_array(res: &res, to_free: &to_free, n: `0`, count: &count, new_count: `32`))
2257	return ERR_PTR(error: -ENOMEM);
2258	res[n++] = *path;
2259	list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
2260	if (!is_subdir(child->mnt_mountpoint, path->dentry))
2261	continue;
2262	for (struct mount *m = child; m; m = next_mnt(p: m, root: child)) {
2263	if (!extend_array(res: &res, to_free: &to_free, n, count: &count, new_count: `2` * count))
2264	return ERR_PTR(error: -ENOMEM);
2265	res[n].mnt = &m->mnt;
2266	res[n].dentry = m->mnt.mnt_root;
2267	n++;
2268	}
2269	}
2270	if (!extend_array(res: &res, to_free: &to_free, n, count: &count, new_count: count + `1`))
2271	return ERR_PTR(error: -ENOMEM);
2272	memset(res + n, `0`, (count - n) * sizeof(struct path));
2273	for (struct path *p = res; p->mnt; p++)
2274	path_get(p);
2275	return res;
2276	}
2277
2278	void drop_collected_paths(const struct path paths, const* struct path *prealloc)
2279	{
2280	for (const struct path *p = paths; p->mnt; p++)
2281	path_put(p);
2282	if (paths != prealloc)
2283	kfree(objp: paths);
2284	}
2285
2286	static struct mnt_namespace alloc_mnt_ns(struct* user_namespace *, bool);
2287
2288	void dissolve_on_fput(struct vfsmount *mnt)
2289	{
2290	struct mount *m = real_mount(mnt);
2291
2292	/*
2293	* m used to be the root of anon namespace; if it still is one,
2294	* we need to dissolve the mount tree and free that namespace.
2295	* Let's try to avoid taking namespace_sem if we can determine
2296	* that there's nothing to do without it - rcu_read_lock() is
2297	* enough to make anon_ns_root() memory-safe and once m has
2298	* left its namespace, it's no longer our concern, since it will
2299	* never become a root of anon ns again.
2300	*/
2301
2302	scoped_guard(rcu) {
2303	if (!anon_ns_root(m))
2304	return;
2305	}
2306
2307	scoped_guard(namespace_excl) {
2308	if (!anon_ns_root(m))
2309	return;
2310
2311	emptied_ns = m->mnt_ns;
2312	lock_mount_hash();
2313	umount_tree(mnt: m, how: UMOUNT_CONNECTED);
2314	unlock_mount_hash();
2315	}
2316	}
2317
2318	/ locks: namespace_shared && pinned(mnt) \|\| mount_locked_reader /
2319	static bool __has_locked_children(struct mount mnt, struct* dentry *dentry)
2320	{
2321	struct mount *child;
2322
2323	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2324	if (!is_subdir(child->mnt_mountpoint, dentry))
2325	continue;
2326
2327	if (child->mnt.mnt_flags & MNT_LOCKED)
2328	return true;
2329	}
2330	return false;
2331	}
2332
2333	bool has_locked_children(struct mount mnt, struct* dentry *dentry)
2334	{
2335	guard(mount_locked_reader)();
2336	return __has_locked_children(mnt, dentry);
2337	}
2338
2339	/*
2340	* Check that there aren't references to earlier/same mount namespaces in the
2341	* specified subtree. Such references can act as pins for mount namespaces
2342	* that aren't checked by the mount-cycle checking code, thereby allowing
2343	* cycles to be made.
2344	*
2345	* locks: mount_locked_reader \|\| namespace_shared && pinned(subtree)
2346	*/
2347	static bool check_for_nsfs_mounts(struct mount *subtree)
2348	{
2349	for (struct mount *p = subtree; p; p = next_mnt(p, root: subtree))
2350	if (mnt_ns_loop(dentry: p->mnt.mnt_root))
2351	return false;
2352	return true;
2353	}
2354
2355	/**
2356	* clone_private_mount - create a private clone of a path
2357	* @path: path to clone
2358	*
2359	* This creates a new vfsmount, which will be the clone of @path. The new mount
2360	* will not be attached anywhere in the namespace and will be private (i.e.
2361	* changes to the originating mount won't be propagated into this).
2362	*
2363	* This assumes caller has called or done the equivalent of may_mount().
2364	*
2365	* Release with mntput().
2366	*/
2367	struct vfsmount clone_private_mount(const* struct path *path)
2368	{
2369	struct mount *old_mnt = real_mount(mnt: path->mnt);
2370	struct mount *new_mnt;
2371
2372	guard(namespace_shared)();
2373
2374	if (IS_MNT_UNBINDABLE(old_mnt))
2375	return ERR_PTR(error: -EINVAL);
2376
2377	/*
2378	* Make sure the source mount is acceptable.
2379	* Anything mounted in our mount namespace is allowed.
2380	* Otherwise, it must be the root of an anonymous mount
2381	* namespace, and we need to make sure no namespace
2382	* loops get created.
2383	*/
2384	if (!check_mnt(mnt: old_mnt)) {
2385	if (!anon_ns_root(m: old_mnt))
2386	return ERR_PTR(error: -EINVAL);
2387
2388	if (!check_for_nsfs_mounts(subtree: old_mnt))
2389	return ERR_PTR(error: -EINVAL);
2390	}
2391
2392	if (!ns_capable(ns: old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
2393	return ERR_PTR(error: -EPERM);
2394
2395	if (__has_locked_children(mnt: old_mnt, dentry: path->dentry))
2396	return ERR_PTR(error: -EINVAL);
2397
2398	new_mnt = clone_mnt(old: old_mnt, root: path->dentry, CL_PRIVATE);
2399	if (IS_ERR(ptr: new_mnt))
2400	return ERR_PTR(error: -EINVAL);
2401
2402	/ Longterm mount to be removed by kern_unmount() /*
2403	new_mnt->mnt_ns = MNT_NS_INTERNAL;
2404	return &new_mnt->mnt;
2405	}
2406	EXPORT_SYMBOL_GPL(clone_private_mount);
2407
2408	static void lock_mnt_tree(struct mount *mnt)
2409	{
2410	struct mount *p;
2411
2412	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
2413	int flags = p->mnt.mnt_flags;
2414	/ Don't allow unprivileged users to change mount flags /
2415	flags \|= MNT_LOCK_ATIME;
2416
2417	if (flags & MNT_READONLY)
2418	flags \|= MNT_LOCK_READONLY;
2419
2420	if (flags & MNT_NODEV)
2421	flags \|= MNT_LOCK_NODEV;
2422
2423	if (flags & MNT_NOSUID)
2424	flags \|= MNT_LOCK_NOSUID;
2425
2426	if (flags & MNT_NOEXEC)
2427	flags \|= MNT_LOCK_NOEXEC;
2428	/ Don't allow unprivileged users to reveal what is under a mount /
2429	if (list_empty(head: &p->mnt_expire) && p != mnt)
2430	flags \|= MNT_LOCKED;
2431	p->mnt.mnt_flags = flags;
2432	}
2433	}
2434
2435	static void cleanup_group_ids(struct mount mnt, struct* mount *end)
2436	{
2437	struct mount *p;
2438
2439	for (p = mnt; p != end; p = next_mnt(p, root: mnt)) {
2440	if (p->mnt_group_id && !IS_MNT_SHARED(p))
2441	mnt_release_group_id(mnt: p);
2442	}
2443	}
2444
2445	static int invent_group_ids(struct mount *mnt, bool recurse)
2446	{
2447	struct mount *p;
2448
2449	for (p = mnt; p; p = recurse ? next_mnt(p, root: mnt) : NULL) {
2450	if (!p->mnt_group_id) {
2451	int err = mnt_alloc_group_id(mnt: p);
2452	if (err) {
2453	cleanup_group_ids(mnt, end: p);
2454	return err;
2455	}
2456	}
2457	}
2458
2459	return `0`;
2460	}
2461
2462	int count_mounts(struct mnt_namespace ns, struct* mount *mnt)
2463	{
2464	unsigned int max = READ_ONCE(sysctl_mount_max);
2465	unsigned int mounts = `0`;
2466	struct mount *p;
2467
2468	if (ns->nr_mounts >= max)
2469	return -ENOSPC;
2470	max -= ns->nr_mounts;
2471	if (ns->pending_mounts >= max)
2472	return -ENOSPC;
2473	max -= ns->pending_mounts;
2474
2475	for (p = mnt; p; p = next_mnt(p, root: mnt))
2476	mounts++;
2477
2478	if (mounts > max)
2479	return -ENOSPC;
2480
2481	ns->pending_mounts += mounts;
2482	return `0`;
2483	}
2484
2485	enum mnt_tree_flags_t {
2486	MNT_TREE_BENEATH = BIT(`0`),
2487	MNT_TREE_PROPAGATION = BIT(`1`),
2488	};
2489
2490	/**
2491	* attach_recursive_mnt - attach a source mount tree
2492	* @source_mnt: mount tree to be attached
2493	* @dest: the context for mounting at the place where the tree should go
2494	*
2495	* NOTE: in the table below explains the semantics when a source mount
2496	* of a given type is attached to a destination mount of a given type.
2497	* ---------------------------------------------------------------------------
2498	* \| BIND MOUNT OPERATION \|
2499	* \|**************************************************************************
2500	* \| source-->\| shared \| private \| slave \| unbindable \|
2501	* \| dest \| \| \| \| \|
2502	* \| \| \| \| \| \| \|
2503	* \| v \| \| \| \| \|
2504	* \|**************************************************************************
2505	* \| shared \| shared (++) \| shared (+) \| shared(+++)\| invalid \|
2506	* \| \| \| \| \| \|
2507	* \|non-shared\| shared (+) \| private \| slave (*) \| invalid \|
2508	* ***************************************************************************
2509	* A bind operation clones the source mount and mounts the clone on the
2510	* destination mount.
2511	*
2512	* (++) the cloned mount is propagated to all the mounts in the propagation
2513	* tree of the destination mount and the cloned mount is added to
2514	* the peer group of the source mount.
2515	* (+) the cloned mount is created under the destination mount and is marked
2516	* as shared. The cloned mount is added to the peer group of the source
2517	* mount.
2518	* (+++) the mount is propagated to all the mounts in the propagation tree
2519	* of the destination mount and the cloned mount is made slave
2520	* of the same master as that of the source mount. The cloned mount
2521	* is marked as 'shared and slave'.
2522	* (*) the cloned mount is made a slave of the same master as that of the
2523	* source mount.
2524	*
2525	* ---------------------------------------------------------------------------
2526	* \| MOVE MOUNT OPERATION \|
2527	* \|**************************************************************************
2528	* \| source-->\| shared \| private \| slave \| unbindable \|
2529	* \| dest \| \| \| \| \|
2530	* \| \| \| \| \| \| \|
2531	* \| v \| \| \| \| \|
2532	* \|**************************************************************************
2533	* \| shared \| shared (+) \| shared (+) \| shared(+++) \| invalid \|
2534	* \| \| \| \| \| \|
2535	* \|non-shared\| shared (+) \| private \| slave () \| unbindable \|
2536	* ***************************************************************************
2537	*
2538	* (+) the mount is moved to the destination. And is then propagated to
2539	* all the mounts in the propagation tree of the destination mount.
2540	* (+*) the mount is moved to the destination.
2541	* (+++) the mount is moved to the destination and is then propagated to
2542	* all the mounts belonging to the destination mount's propagation tree.
2543	* the mount is marked as 'shared and slave'.
2544	* (*) the mount continues to be a slave at the new location.
2545	*
2546	* if the source mount is a tree, the operations explained above is
2547	* applied to each mount in the tree.
2548	* Must be called without spinlocks held, since this function can sleep
2549	* in allocations.
2550	*
2551	* Context: The function expects namespace_lock() to be held.
2552	* Return: If @source_mnt was successfully attached 0 is returned.
2553	* Otherwise a negative error code is returned.
2554	*/
2555	static int attach_recursive_mnt(struct mount *source_mnt,
2556	const struct pinned_mountpoint *dest)
2557	{
2558	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2559	struct mount *dest_mnt = dest->parent;
2560	struct mountpoint *dest_mp = dest->mp;
2561	HLIST_HEAD(tree_list);
2562	struct mnt_namespace *ns = dest_mnt->mnt_ns;
2563	struct pinned_mountpoint root = {};
2564	struct mountpoint *shorter = NULL;
2565	struct mount child, p;
2566	struct mount *top;
2567	struct hlist_node *n;
2568	int err = `0`;
2569	bool moving = mnt_has_parent(mnt: source_mnt);
2570
2571	/*
2572	* Preallocate a mountpoint in case the new mounts need to be
2573	* mounted beneath mounts on the same mountpoint.
2574	*/
2575	for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
2576	if (!shorter && is_mnt_ns_file(dentry: top->mnt.mnt_root))
2577	shorter = top->mnt_mp;
2578	}
2579	err = get_mountpoint(dentry: top->mnt.mnt_root, m: &root);
2580	if (err)
2581	return err;
2582
2583	/ Is there space to add these mounts to the mount namespace? /
2584	if (!moving) {
2585	err = count_mounts(ns, mnt: source_mnt);
2586	if (err)
2587	goto out;
2588	}
2589
2590	if (IS_MNT_SHARED(dest_mnt)) {
2591	err = invent_group_ids(mnt: source_mnt, recurse: true);
2592	if (err)
2593	goto out;
2594	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
2595	}
2596	lock_mount_hash();
2597	if (err)
2598	goto out_cleanup_ids;
2599
2600	if (IS_MNT_SHARED(dest_mnt)) {
2601	for (p = source_mnt; p; p = next_mnt(p, root: source_mnt))
2602	set_mnt_shared(p);
2603	}
2604
2605	if (moving) {
2606	umount_mnt(mnt: source_mnt);
2607	mnt_notify_add(m: source_mnt);
2608	/ if the mount is moved, it should no longer be expired*
2609	* automatically */
2610	list_del_init(entry: &source_mnt->mnt_expire);
2611	} else {
2612	if (source_mnt->mnt_ns) {
2613	/ move from anon - the caller will destroy /
2614	emptied_ns = source_mnt->mnt_ns;
2615	for (p = source_mnt; p; p = next_mnt(p, root: source_mnt))
2616	move_from_ns(mnt: p);
2617	}
2618	}
2619
2620	mnt_set_mountpoint(mnt: dest_mnt, mp: dest_mp, child_mnt: source_mnt);
2621	/*
2622	* Now the original copy is in the same state as the secondaries -
2623	* its root attached to mountpoint, but not hashed and all mounts
2624	* in it are either in our namespace or in no namespace at all.
2625	* Add the original to the list of copies and deal with the
2626	* rest of work for all of them uniformly.
2627	*/
2628	hlist_add_head(n: &source_mnt->mnt_hash, h: &tree_list);
2629
2630	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2631	struct mount *q;
2632	hlist_del_init(n: &child->mnt_hash);
2633	/ Notice when we are propagating across user namespaces /
2634	if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2635	lock_mnt_tree(mnt: child);
2636	q = __lookup_mnt(mnt: &child->mnt_parent->mnt,
2637	dentry: child->mnt_mountpoint);
2638	commit_tree(mnt: child);
2639	if (q) {
2640	struct mount *r = topmost_overmount(m: child);
2641	struct mountpoint *mp = root.mp;
2642
2643	if (unlikely(shorter) && child != source_mnt)
2644	mp = shorter;
2645	mnt_change_mountpoint(parent: r, mp, mnt: q);
2646	}
2647	}
2648	unpin_mountpoint(m: &root);
2649	unlock_mount_hash();
2650
2651	return `0`;
2652
2653	out_cleanup_ids:
2654	while (!hlist_empty(h: &tree_list)) {
2655	child = hlist_entry(tree_list.first, struct mount, mnt_hash);
2656	child->mnt_parent->mnt_ns->pending_mounts = `0`;
2657	umount_tree(mnt: child, how: UMOUNT_SYNC);
2658	}
2659	unlock_mount_hash();
2660	cleanup_group_ids(mnt: source_mnt, NULL);
2661	out:
2662	ns->pending_mounts = `0`;
2663
2664	read_seqlock_excl(sl: &mount_lock);
2665	unpin_mountpoint(m: &root);
2666	read_sequnlock_excl(sl: &mount_lock);
2667
2668	return err;
2669	}
2670
2671	static inline struct mount where_to_mount(const* struct path *path,
2672	struct dentry **dentry,
2673	bool beneath)
2674	{
2675	struct mount *m;
2676
2677	if (unlikely(beneath)) {
2678	m = topmost_overmount(m: real_mount(mnt: path->mnt));
2679	*dentry = m->mnt_mountpoint;
2680	return m->mnt_parent;
2681	}
2682	m = __lookup_mnt(mnt: path->mnt, dentry: path->dentry);
2683	if (unlikely(m)) {
2684	m = topmost_overmount(m);
2685	*dentry = m->mnt.mnt_root;
2686	return m;
2687	}
2688	*dentry = path->dentry;
2689	return real_mount(mnt: path->mnt);
2690	}
2691
2692	/**
2693	* do_lock_mount - acquire environment for mounting
2694	* @path: target path
2695	* @res: context to set up
2696	* @beneath: whether the intention is to mount beneath @path
2697	*
2698	* To mount something at given location, we need
2699	* namespace_sem locked exclusive
2700	* inode of dentry we are mounting on locked exclusive
2701	* struct mountpoint for that dentry
2702	* struct mount we are mounting on
2703	*
2704	* Results are stored in caller-supplied context (pinned_mountpoint);
2705	* on success we have res->parent and res->mp pointing to parent and
2706	* mountpoint respectively and res->node inserted into the ->m_list
2707	* of the mountpoint, making sure the mountpoint won't disappear.
2708	* On failure we have res->parent set to ERR_PTR(-E...), res->mp
2709	* left NULL, res->node - empty.
2710	* In case of success do_lock_mount returns with locks acquired (in
2711	* proper order - inode lock nests outside of namespace_sem).
2712	*
2713	* Request to mount on overmounted location is treated as "mount on
2714	* top of whatever's overmounting it"; request to mount beneath
2715	* a location - "mount immediately beneath the topmost mount at that
2716	* place".
2717	*
2718	* In all cases the location must not have been unmounted and the
2719	* chosen mountpoint must be allowed to be mounted on. For "beneath"
2720	* case we also require the location to be at the root of a mount
2721	* that has a parent (i.e. is not a root of some namespace).
2722	*/
2723	static void do_lock_mount(const struct path *path,
2724	struct pinned_mountpoint *res,
2725	bool beneath)
2726	{
2727	int err;
2728
2729	if (unlikely(beneath) && !path_mounted(path)) {
2730	res->parent = ERR_PTR(error: -EINVAL);
2731	return;
2732	}
2733
2734	do {
2735	struct dentry dentry, d;
2736	struct mount m, n;
2737
2738	scoped_guard(mount_locked_reader) {
2739	m = where_to_mount(path, dentry: &dentry, beneath);
2740	if (&m->mnt != path->mnt) {
2741	mntget(&m->mnt);
2742	dget(dentry);
2743	}
2744	}
2745
2746	inode_lock(inode: dentry->d_inode);
2747	namespace_lock();
2748
2749	// check if the chain of mounts (if any) has changed.
2750	scoped_guard(mount_locked_reader)
2751	n = where_to_mount(path, dentry: &d, beneath);
2752
2753	if (unlikely(n != m \|\| dentry != d))
2754	err = -EAGAIN; // something moved, retry
2755	else if (unlikely(cant_mount(dentry) \|\| !is_mounted(path->mnt)))
2756	err = -ENOENT; // not to be mounted on
2757	else if (beneath && &m->mnt == path->mnt && !m->overmount)
2758	err = -EINVAL;
2759	else
2760	err = get_mountpoint(dentry, m: res);
2761
2762	if (unlikely(err)) {
2763	res->parent = ERR_PTR(error: err);
2764	namespace_unlock();
2765	inode_unlock(inode: dentry->d_inode);
2766	} else {
2767	res->parent = m;
2768	}
2769	/*
2770	* Drop the temporary references. This is subtle - on success
2771	* we are doing that under namespace_sem, which would normally
2772	* be forbidden. However, in that case we are guaranteed that
2773	* refcounts won't reach zero, since we know that path->mnt
2774	* is mounted and thus all mounts reachable from it are pinned
2775	* and stable, along with their mountpoints and roots.
2776	*/
2777	if (&m->mnt != path->mnt) {
2778	dput(dentry);
2779	mntput(&m->mnt);
2780	}
2781	} while (err == -EAGAIN);
2782	}
2783
2784	static void __unlock_mount(struct pinned_mountpoint *m)
2785	{
2786	inode_unlock(inode: m->mp->m_dentry->d_inode);
2787	read_seqlock_excl(sl: &mount_lock);
2788	unpin_mountpoint(m);
2789	read_sequnlock_excl(sl: &mount_lock);
2790	namespace_unlock();
2791	}
2792
2793	static inline void unlock_mount(struct pinned_mountpoint *m)
2794	{
2795	if (!IS_ERR(ptr: m->parent))
2796	__unlock_mount(m);
2797	}
2798
/*
 * Each macro declares a struct pinned_mountpoint named @mp in the current
 * scope, registers unlock_mount() as its __cleanup handler and tries to lock
 * the mount environment for @path.  Callers must test IS_ERR(mp.parent)
 * afterwards.  Being a declaration plus a call, the macros are only usable
 * where a declaration is allowed.
 *
 * NOTE(review): lock_mount_exact() is defined outside this part of the file.
 */
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	do_lock_mount((path), &mp, (beneath))
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
#define LOCK_MOUNT_EXACT(mp, path) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	lock_mount_exact((path), &mp)
2806
2807	static int graft_tree(struct mount mnt, const* struct pinned_mountpoint *mp)
2808	{
2809	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2810	return -EINVAL;
2811
2812	if (d_is_dir(dentry: mp->mp->m_dentry) !=
2813	d_is_dir(dentry: mnt->mnt.mnt_root))
2814	return -ENOTDIR;
2815
2816	return attach_recursive_mnt(source_mnt: mnt, dest: mp);
2817	}
2818
2819	static int may_change_propagation(const struct mount *m)
2820	{
2821	struct mnt_namespace *ns = m->mnt_ns;
2822
2823	// it must be mounted in some namespace
2824	if (IS_ERR_OR_NULL(ptr: ns)) // is_mounted()
2825	return -EINVAL;
2826	// and the caller must be admin in userns of that namespace
2827	if (!ns_capable(ns: ns->user_ns, CAP_SYS_ADMIN))
2828	return -EPERM;
2829	return `0`;
2830	}
2831
2832	/*
2833	* Sanity check the flags to change_mnt_propagation.
2834	*/
2835
2836	static int flags_to_propagation_type(int ms_flags)
2837	{
2838	int type = ms_flags & ~(MS_REC \| MS_SILENT);
2839
2840	/ Fail if any non-propagation flags are set /
2841	if (type & ~(MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
2842	return `0`;
2843	/ Only one propagation flag should be set /
2844	if (!is_power_of_2(n: type))
2845	return `0`;
2846	return type;
2847	}
2848
2849	/*
2850	* recursively change the type of the mountpoint.
2851	*/
2852	static int do_change_type(const struct path path, int* ms_flags)
2853	{
2854	struct mount *m;
2855	struct mount *mnt = real_mount(mnt: path->mnt);
2856	int recurse = ms_flags & MS_REC;
2857	int type;
2858	int err;
2859
2860	if (!path_mounted(path))
2861	return -EINVAL;
2862
2863	type = flags_to_propagation_type(ms_flags);
2864	if (!type)
2865	return -EINVAL;
2866
2867	guard(namespace_excl)();
2868
2869	err = may_change_propagation(m: mnt);
2870	if (err)
2871	return err;
2872
2873	if (type == MS_SHARED) {
2874	err = invent_group_ids(mnt, recurse);
2875	if (err)
2876	return err;
2877	}
2878
2879	for (m = mnt; m; m = (recurse ? next_mnt(p: m, root: mnt) : NULL))
2880	change_mnt_propagation(m, type);
2881
2882	return `0`;
2883	}
2884
2885	/ may_copy_tree() - check if a mount tree can be copied*
2886	* @path: path to the mount tree to be copied
2887	*
2888	* This helper checks if the caller may copy the mount tree starting
2889	* from @path->mnt. The caller may copy the mount tree under the
2890	* following circumstances:
2891	*
2892	* (1) The caller is located in the mount namespace of the mount tree.
2893	* This also implies that the mount does not belong to an anonymous
2894	* mount namespace.
2895	* (2) The caller tries to copy an nfs mount referring to a mount
2896	* namespace, i.e., the caller is trying to copy a mount namespace
2897	* entry from nsfs.
2898	* (3) The caller tries to copy a pidfs mount referring to a pidfd.
2899	* (4) The caller is trying to copy a mount tree that belongs to an
2900	* anonymous mount namespace.
2901	*
2902	* For that to be safe, this helper enforces that the origin mount
2903	* namespace the anonymous mount namespace was created from is the
2904	* same as the caller's mount namespace by comparing the sequence
2905	* numbers.
2906	*
2907	* This is not strictly necessary. The current semantics of the new
2908	* mount api enforce that the caller must be located in the same
2909	* mount namespace as the mount tree it interacts with. Using the
2910	* origin sequence number preserves these semantics even for
2911	* anonymous mount namespaces. However, one could envision extending
2912	* the api to directly operate across mount namespace if needed.
2913	*
2914	* The ownership of a non-anonymous mount namespace such as the
2915	* caller's cannot change.
2916	* => We know that the caller's mount namespace is stable.
2917	*
2918	* If the origin sequence number of the anonymous mount namespace is
2919	* the same as the sequence number of the caller's mount namespace.
2920	* => The owning namespaces are the same.
2921	*
2922	* ==> The earlier capability check on the owning namespace of the
2923	* caller's mount namespace ensures that the caller has the
2924	* ability to copy the mount tree.
2925	*
2926	* Returns true if the mount tree can be copied, false otherwise.
2927	*/
2928	static inline bool may_copy_tree(const struct path *path)
2929	{
2930	struct mount *mnt = real_mount(mnt: path->mnt);
2931	const struct dentry_operations *d_op;
2932
2933	if (check_mnt(mnt))
2934	return true;
2935
2936	d_op = path->dentry->d_op;
2937	if (d_op == &ns_dentry_operations)
2938	return true;
2939
2940	if (d_op == &pidfs_dentry_operations)
2941	return true;
2942
2943	if (!is_mounted(mnt: path->mnt))
2944	return false;
2945
2946	return check_anonymous_mnt(mnt);
2947	}
2948
2949
2950	static struct mount __do_loopback(const* struct path old_path, int* recurse)
2951	{
2952	struct mount *old = real_mount(mnt: old_path->mnt);
2953
2954	if (IS_MNT_UNBINDABLE(old))
2955	return ERR_PTR(error: -EINVAL);
2956
2957	if (!may_copy_tree(path: old_path))
2958	return ERR_PTR(error: -EINVAL);
2959
2960	if (!recurse && __has_locked_children(mnt: old, dentry: old_path->dentry))
2961	return ERR_PTR(error: -EINVAL);
2962
2963	if (recurse)
2964	return copy_tree(src_root: old, dentry: old_path->dentry, CL_COPY_MNT_NS_FILE);
2965	else
2966	return clone_mnt(old, root: old_path->dentry, flag: `0`);
2967	}
2968
2969	/*
2970	* do loopback mount.
2971	*/
2972	static int do_loopback(const struct path path, const* char *old_name,
2973	int recurse)
2974	{
2975	struct path old_path __free(path_put) = {};
2976	struct mount *mnt = NULL;
2977	int err;
2978	if (!old_name \|\| !*old_name)
2979	return -EINVAL;
2980	err = kern_path(old_name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &old_path);
2981	if (err)
2982	return err;
2983
2984	if (mnt_ns_loop(dentry: old_path.dentry))
2985	return -EINVAL;
2986
2987	LOCK_MOUNT(mp, path);
2988	if (IS_ERR(ptr: mp.parent))
2989	return PTR_ERR(ptr: mp.parent);
2990
2991	if (!check_mnt(mnt: mp.parent))
2992	return -EINVAL;
2993
2994	mnt = __do_loopback(old_path: &old_path, recurse);
2995	if (IS_ERR(ptr: mnt))
2996	return PTR_ERR(ptr: mnt);
2997
2998	err = graft_tree(mnt, mp: &mp);
2999	if (err) {
3000	lock_mount_hash();
3001	umount_tree(mnt, how: UMOUNT_SYNC);
3002	unlock_mount_hash();
3003	}
3004	return err;
3005	}
3006
3007	static struct mnt_namespace get_detached_copy(const* struct path *path, bool recursive)
3008	{
3009	struct mnt_namespace ns, mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
3010	struct user_namespace *user_ns = mnt_ns->user_ns;
3011	struct mount mnt, p;
3012
3013	ns = alloc_mnt_ns(user_ns, true);
3014	if (IS_ERR(ptr: ns))
3015	return ns;
3016
3017	guard(namespace_excl)();
3018
3019	/*
3020	* Record the sequence number of the source mount namespace.
3021	* This needs to hold namespace_sem to ensure that the mount
3022	* doesn't get attached.
3023	*/
3024	if (is_mounted(mnt: path->mnt)) {
3025	src_mnt_ns = real_mount(mnt: path->mnt)->mnt_ns;
3026	if (is_anon_ns(ns: src_mnt_ns))
3027	ns->seq_origin = src_mnt_ns->seq_origin;
3028	else
3029	ns->seq_origin = src_mnt_ns->ns.ns_id;
3030	}
3031
3032	mnt = __do_loopback(old_path: path, recurse: recursive);
3033	if (IS_ERR(ptr: mnt)) {
3034	emptied_ns = ns;
3035	return ERR_CAST(ptr: mnt);
3036	}
3037
3038	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
3039	mnt_add_to_ns(ns, mnt: p);
3040	ns->nr_mounts++;
3041	}
3042	ns->root = mnt;
3043	return ns;
3044	}
3045
3046	static struct file open_detached_copy(struct* path *path, bool recursive)
3047	{
3048	struct mnt_namespace *ns = get_detached_copy(path, recursive);
3049	struct file *file;
3050
3051	if (IS_ERR(ptr: ns))
3052	return ERR_CAST(ptr: ns);
3053
3054	mntput(path->mnt);
3055	path->mnt = mntget(&ns->root->mnt);
3056	file = dentry_open(path, O_PATH, current_cred());
3057	if (IS_ERR(ptr: file))
3058	dissolve_on_fput(mnt: path->mnt);
3059	else
3060	file->f_mode \|= FMODE_NEED_UNMOUNT;
3061	return file;
3062	}
3063
3064	static struct file vfs_open_tree(int* dfd, const char __user filename, unsigned* int flags)
3065	{
3066	int ret;
3067	struct path path __free(path_put) = {};
3068	int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
3069	bool detached = flags & OPEN_TREE_CLONE;
3070
3071	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
3072
3073	if (flags & ~(AT_EMPTY_PATH \| AT_NO_AUTOMOUNT \| AT_RECURSIVE \|
3074	AT_SYMLINK_NOFOLLOW \| OPEN_TREE_CLONE \|
3075	OPEN_TREE_CLOEXEC))
3076	return ERR_PTR(error: -EINVAL);
3077
3078	if ((flags & (AT_RECURSIVE \| OPEN_TREE_CLONE)) == AT_RECURSIVE)
3079	return ERR_PTR(error: -EINVAL);
3080
3081	if (flags & AT_NO_AUTOMOUNT)
3082	lookup_flags &= ~LOOKUP_AUTOMOUNT;
3083	if (flags & AT_SYMLINK_NOFOLLOW)
3084	lookup_flags &= ~LOOKUP_FOLLOW;
3085	if (flags & AT_EMPTY_PATH)
3086	lookup_flags \|= LOOKUP_EMPTY;
3087
3088	if (detached && !may_mount())
3089	return ERR_PTR(error: -EPERM);
3090
3091	ret = user_path_at(dfd, filename, lookup_flags, &path);
3092	if (unlikely(ret))
3093	return ERR_PTR(error: ret);
3094
3095	if (detached)
3096	return open_detached_copy(path: &path, recursive: flags & AT_RECURSIVE);
3097
3098	return dentry_open(path: &path, O_PATH, current_cred());
3099	}
3100
3101	SYSCALL_DEFINE3(open_tree, int, dfd, const char __user , filename, unsigned*, flags)
3102	{
3103	return FD_ADD(flags, vfs_open_tree(dfd, filename, flags));
3104	}
3105
3106	/*
3107	* Don't allow locked mount flags to be cleared.
3108	*
3109	* No locks need to be held here while testing the various MNT_LOCK
3110	* flags because those flags can never be cleared once they are set.
3111	*/
3112	static bool can_change_locked_flags(struct mount mnt, unsigned* int mnt_flags)
3113	{
3114	unsigned int fl = mnt->mnt.mnt_flags;
3115
3116	if ((fl & MNT_LOCK_READONLY) &&
3117	!(mnt_flags & MNT_READONLY))
3118	return false;
3119
3120	if ((fl & MNT_LOCK_NODEV) &&
3121	!(mnt_flags & MNT_NODEV))
3122	return false;
3123
3124	if ((fl & MNT_LOCK_NOSUID) &&
3125	!(mnt_flags & MNT_NOSUID))
3126	return false;
3127
3128	if ((fl & MNT_LOCK_NOEXEC) &&
3129	!(mnt_flags & MNT_NOEXEC))
3130	return false;
3131
3132	if ((fl & MNT_LOCK_ATIME) &&
3133	((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
3134	return false;
3135
3136	return true;
3137	}
3138
3139	static int change_mount_ro_state(struct mount mnt, unsigned* int mnt_flags)
3140	{
3141	bool readonly_request = (mnt_flags & MNT_READONLY);
3142
3143	if (readonly_request == __mnt_is_readonly(&mnt->mnt))
3144	return `0`;
3145
3146	if (readonly_request)
3147	return mnt_make_readonly(mnt);
3148
3149	mnt->mnt.mnt_flags &= ~MNT_READONLY;
3150	return `0`;
3151	}
3152
3153	static void set_mount_attributes(struct mount mnt, unsigned* int mnt_flags)
3154	{
3155	mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
3156	mnt->mnt.mnt_flags = mnt_flags;
3157	touch_mnt_namespace(ns: mnt->mnt_ns);
3158	}
3159
3160	static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
3161	struct vfsmount *mnt)
3162	{
3163	struct super_block *sb = mnt->mnt_sb;
3164
3165	if (!__mnt_is_readonly(mnt) &&
3166	(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
3167	(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
3168	char buf, mntpath;
3169
3170	buf = (char *)__get_free_page(GFP_KERNEL);
3171	if (buf)
3172	mntpath = d_path(mountpoint, buf, PAGE_SIZE);
3173	else
3174	mntpath = ERR_PTR(error: -ENOMEM);
3175	if (IS_ERR(ptr: mntpath))
3176	mntpath = "(unknown)";
3177
3178	pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
3179	sb->s_type->name,
3180	is_mounted(mnt) ? "remounted" : "mounted",
3181	mntpath, &sb->s_time_max,
3182	(unsigned long long)sb->s_time_max);
3183
3184	sb->s_iflags \|= SB_I_TS_EXPIRY_WARNED;
3185	if (buf)
3186	free_page((unsigned long)buf);
3187	}
3188	}
3189
3190	/*
3191	* Handle reconfiguration of the mountpoint only without alteration of the
3192	* superblock it refers to. This is triggered by specifying MS_REMOUNT\|MS_BIND
3193	* to mount(2).
3194	*/
3195	static int do_reconfigure_mnt(const struct path path, unsigned* int mnt_flags)
3196	{
3197	struct super_block *sb = path->mnt->mnt_sb;
3198	struct mount *mnt = real_mount(mnt: path->mnt);
3199	int ret;
3200
3201	if (!check_mnt(mnt))
3202	return -EINVAL;
3203
3204	if (!path_mounted(path))
3205	return -EINVAL;
3206
3207	if (!can_change_locked_flags(mnt, mnt_flags))
3208	return -EPERM;
3209
3210	/*
3211	* We're only checking whether the superblock is read-only not
3212	* changing it, so only take down_read(&sb->s_umount).
3213	*/
3214	down_read(sem: &sb->s_umount);
3215	lock_mount_hash();
3216	ret = change_mount_ro_state(mnt, mnt_flags);
3217	if (ret == `0`)
3218	set_mount_attributes(mnt, mnt_flags);
3219	unlock_mount_hash();
3220	up_read(sem: &sb->s_umount);
3221
3222	mnt_warn_timestamp_expiry(mountpoint: path, mnt: &mnt->mnt);
3223
3224	return ret;
3225	}
3226
3227	/*
3228	* change filesystem flags. dir should be a physical root of filesystem.
3229	* If you've mounted a non-root directory somewhere and want to do remount
3230	* on it - tough luck.
3231	*/
3232	static int do_remount(const struct path path, int* sb_flags,
3233	int mnt_flags, void *data)
3234	{
3235	int err;
3236	struct super_block *sb = path->mnt->mnt_sb;
3237	struct mount *mnt = real_mount(mnt: path->mnt);
3238	struct fs_context *fc;
3239
3240	if (!check_mnt(mnt))
3241	return -EINVAL;
3242
3243	if (!path_mounted(path))
3244	return -EINVAL;
3245
3246	if (!can_change_locked_flags(mnt, mnt_flags))
3247	return -EPERM;
3248
3249	fc = fs_context_for_reconfigure(dentry: path->dentry, sb_flags, MS_RMT_MASK);
3250	if (IS_ERR(ptr: fc))
3251	return PTR_ERR(ptr: fc);
3252
3253	/*
3254	* Indicate to the filesystem that the remount request is coming
3255	* from the legacy mount system call.
3256	*/
3257	fc->oldapi = true;
3258
3259	err = parse_monolithic_mount_data(fc, data);
3260	if (!err) {
3261	down_write(sem: &sb->s_umount);
3262	err = -EPERM;
3263	if (ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN)) {
3264	err = reconfigure_super(fc);
3265	if (!err) {
3266	lock_mount_hash();
3267	set_mount_attributes(mnt, mnt_flags);
3268	unlock_mount_hash();
3269	}
3270	}
3271	up_write(sem: &sb->s_umount);
3272	}
3273
3274	mnt_warn_timestamp_expiry(mountpoint: path, mnt: &mnt->mnt);
3275
3276	put_fs_context(fc);
3277	return err;
3278	}
3279
/*
 * Walk the mount tree rooted at @mnt with next_mnt() and report whether any
 * mount in it (including @mnt itself) is unbindable.
 *
 * Returns 1 if an unbindable mount was found, 0 otherwise.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}
3289
3290	static int do_set_group(const struct path from_path, const* struct path *to_path)
3291	{
3292	struct mount *from = real_mount(mnt: from_path->mnt);
3293	struct mount *to = real_mount(mnt: to_path->mnt);
3294	int err;
3295
3296	guard(namespace_excl)();
3297
3298	err = may_change_propagation(m: from);
3299	if (err)
3300	return err;
3301	err = may_change_propagation(m: to);
3302	if (err)
3303	return err;
3304
3305	/ To and From paths should be mount roots /
3306	if (!path_mounted(path: from_path))
3307	return -EINVAL;
3308	if (!path_mounted(path: to_path))
3309	return -EINVAL;
3310
3311	/ Setting sharing groups is only allowed across same superblock /
3312	if (from->mnt.mnt_sb != to->mnt.mnt_sb)
3313	return -EINVAL;
3314
3315	/ From mount root should be wider than To mount root /
3316	if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
3317	return -EINVAL;
3318
3319	/ From mount should not have locked children in place of To's root /
3320	if (__has_locked_children(mnt: from, dentry: to->mnt.mnt_root))
3321	return -EINVAL;
3322
3323	/ Setting sharing groups is only allowed on private mounts /
3324	if (IS_MNT_SHARED(to) \|\| IS_MNT_SLAVE(to))
3325	return -EINVAL;
3326
3327	/ From should not be private /
3328	if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
3329	return -EINVAL;
3330
3331	if (IS_MNT_SLAVE(from)) {
3332	hlist_add_behind(n: &to->mnt_slave, prev: &from->mnt_slave);
3333	to->mnt_master = from->mnt_master;
3334	}
3335
3336	if (IS_MNT_SHARED(from)) {
3337	to->mnt_group_id = from->mnt_group_id;
3338	list_add(new: &to->mnt_share, head: &from->mnt_share);
3339	set_mnt_shared(to);
3340	}
3341	return `0`;
3342	}
3343
3344	/**
3345	* path_overmounted - check if path is overmounted
3346	* @path: path to check
3347	*
3348	* Check if path is overmounted, i.e., if there's a mount on top of
3349	* @path->mnt with @path->dentry as mountpoint.
3350	*
3351	* Context: namespace_sem must be held at least shared.
3352	* MUST NOT be called under lock_mount_hash() (there one should just
3353	* call __lookup_mnt() and check if it returns NULL).
3354	* Return: If path is overmounted true is returned, false if not.
3355	*/
3356	static inline bool path_overmounted(const struct path *path)
3357	{
3358	unsigned seq = read_seqbegin(sl: &mount_lock);
3359	bool no_child;
3360
3361	rcu_read_lock();
3362	no_child = !__lookup_mnt(mnt: path->mnt, dentry: path->dentry);
3363	rcu_read_unlock();
3364	if (need_seqretry(lock: &mount_lock, seq)) {
3365	read_seqlock_excl(sl: &mount_lock);
3366	no_child = !__lookup_mnt(mnt: path->mnt, dentry: path->dentry);
3367	read_sequnlock_excl(sl: &mount_lock);
3368	}
3369	return unlikely(!no_child);
3370	}
3371
3372	/*
3373	* Check if there is a possibly empty chain of descent from p1 to p2.
3374	* Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl).
3375	*/
3376	static bool mount_is_ancestor(const struct mount p1, const* struct mount *p2)
3377	{
3378	while (p2 != p1 && mnt_has_parent(mnt: p2))
3379	p2 = p2->mnt_parent;
3380	return p2 == p1;
3381	}
3382
3383	/**
3384	* can_move_mount_beneath - check that we can mount beneath the top mount
3385	* @mnt_from: mount we are trying to move
3386	* @mnt_to: mount under which to mount
3387	* @mp: mountpoint of @mnt_to
3388	*
3389	* - Make sure that nothing can be mounted beneath the caller's current
3390	* root or the rootfs of the namespace.
3391	* - Make sure that the caller can unmount the topmost mount ensuring
3392	* that the caller could reveal the underlying mountpoint.
3393	* - Ensure that nothing has been mounted on top of @mnt_from before we
3394	* grabbed @namespace_sem to avoid creating pointless shadow mounts.
3395	* - Prevent mounting beneath a mount if the propagation relationship
3396	* between the source mount, parent mount, and top mount would lead to
3397	* nonsensical mount trees.
3398	*
3399	* Context: This function expects namespace_lock() to be held.
3400	* Return: On success 0, and on error a negative error code is returned.
3401	*/
3402	static int can_move_mount_beneath(const struct mount *mnt_from,
3403	const struct mount *mnt_to,
3404	const struct mountpoint *mp)
3405	{
3406	struct mount *parent_mnt_to = mnt_to->mnt_parent;
3407
3408	if (IS_MNT_LOCKED(mnt_to))
3409	return -EINVAL;
3410
3411	/ Avoid creating shadow mounts during mount propagation. /
3412	if (mnt_from->overmount)
3413	return -EINVAL;
3414
3415	/*
3416	* Mounting beneath the rootfs only makes sense when the
3417	* semantics of pivot_root(".", ".") are used.
3418	*/
3419	if (&mnt_to->mnt == current->fs->root.mnt)
3420	return -EINVAL;
3421	if (parent_mnt_to == current->nsproxy->mnt_ns->root)
3422	return -EINVAL;
3423
3424	if (mount_is_ancestor(p1: mnt_to, p2: mnt_from))
3425	return -EINVAL;
3426
3427	/*
3428	* If the parent mount propagates to the child mount this would
3429	* mean mounting @mnt_from on @mnt_to->mnt_parent and then
3430	* propagating a copy @c of @mnt_from on top of @mnt_to. This
3431	* defeats the whole purpose of mounting beneath another mount.
3432	*/
3433	if (propagation_would_overmount(from: parent_mnt_to, to: mnt_to, mp))
3434	return -EINVAL;
3435
3436	/*
3437	* If @mnt_to->mnt_parent propagates to @mnt_from this would
3438	* mean propagating a copy @c of @mnt_from on top of @mnt_from.
3439	* Afterwards @mnt_from would be mounted on top of
3440	* @mnt_to->mnt_parent and @mnt_to would be unmounted from
3441	* @mnt->mnt_parent and remounted on @mnt_from. But since @c is
3442	* already mounted on @mnt_from, @mnt_to would ultimately be
3443	* remounted on top of @c. Afterwards, @mnt_from would be
3444	* covered by a copy @c of @mnt_from and @c would be covered by
3445	* @mnt_from itself. This defeats the whole purpose of mounting
3446	* @mnt_from beneath @mnt_to.
3447	*/
3448	if (check_mnt(mnt: mnt_from) &&
3449	propagation_would_overmount(from: parent_mnt_to, to: mnt_from, mp))
3450	return -EINVAL;
3451
3452	return `0`;
3453	}
3454
3455	/ may_use_mount() - check if a mount tree can be used*
3456	* @mnt: vfsmount to be used
3457	*
3458	* This helper checks if the caller may use the mount tree starting
3459	* from @path->mnt. The caller may use the mount tree under the
3460	* following circumstances:
3461	*
3462	* (1) The caller is located in the mount namespace of the mount tree.
3463	* This also implies that the mount does not belong to an anonymous
3464	* mount namespace.
3465	* (2) The caller is trying to use a mount tree that belongs to an
3466	* anonymous mount namespace.
3467	*
3468	* For that to be safe, this helper enforces that the origin mount
3469	* namespace the anonymous mount namespace was created from is the
3470	* same as the caller's mount namespace by comparing the sequence
3471	* numbers.
3472	*
3473	* The ownership of a non-anonymous mount namespace such as the
3474	* caller's cannot change.
3475	* => We know that the caller's mount namespace is stable.
3476	*
3477	* If the origin sequence number of the anonymous mount namespace is
3478	* the same as the sequence number of the caller's mount namespace.
3479	* => The owning namespaces are the same.
3480	*
3481	* ==> The earlier capability check on the owning namespace of the
3482	* caller's mount namespace ensures that the caller has the
3483	* ability to use the mount tree.
3484	*
3485	* Returns true if the mount tree can be used, false otherwise.
3486	*/
3487	static inline bool may_use_mount(struct mount *mnt)
3488	{
3489	if (check_mnt(mnt))
3490	return true;
3491
3492	/*
3493	* Make sure that noone unmounted the target path or somehow
3494	* managed to get their hands on something purely kernel
3495	* internal.
3496	*/
3497	if (!is_mounted(mnt: &mnt->mnt))
3498	return false;
3499
3500	return check_anonymous_mnt(mnt);
3501	}
3502
3503	static int do_move_mount(const struct path *old_path,
3504	const struct path *new_path,
3505	enum mnt_tree_flags_t flags)
3506	{
3507	struct mount *old = real_mount(mnt: old_path->mnt);
3508	int err;
3509	bool beneath = flags & MNT_TREE_BENEATH;
3510
3511	if (!path_mounted(path: old_path))
3512	return -EINVAL;
3513
3514	if (d_is_dir(dentry: new_path->dentry) != d_is_dir(dentry: old_path->dentry))
3515	return -EINVAL;
3516
3517	LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath);
3518	if (IS_ERR(ptr: mp.parent))
3519	return PTR_ERR(ptr: mp.parent);
3520
3521	if (check_mnt(mnt: old)) {
3522	/ if the source is in our namespace... /
3523	/ ... it should be detachable from parent /
3524	if (!mnt_has_parent(mnt: old) \|\| IS_MNT_LOCKED(old))
3525	return -EINVAL;
3526	/ ... which should not be shared /
3527	if (IS_MNT_SHARED(old->mnt_parent))
3528	return -EINVAL;
3529	/ ... and the target should be in our namespace /
3530	if (!check_mnt(mnt: mp.parent))
3531	return -EINVAL;
3532	} else {
3533	/*
3534	* otherwise the source must be the root of some anon namespace.
3535	*/
3536	if (!anon_ns_root(m: old))
3537	return -EINVAL;
3538	/*
3539	* Bail out early if the target is within the same namespace -
3540	* subsequent checks would've rejected that, but they lose
3541	* some corner cases if we check it early.
3542	*/
3543	if (old->mnt_ns == mp.parent->mnt_ns)
3544	return -EINVAL;
3545	/*
3546	* Target should be either in our namespace or in an acceptable
3547	* anon namespace, sensu check_anonymous_mnt().
3548	*/
3549	if (!may_use_mount(mnt: mp.parent))
3550	return -EINVAL;
3551	}
3552
3553	if (beneath) {
3554	struct mount *over = real_mount(mnt: new_path->mnt);
3555
3556	if (mp.parent != over->mnt_parent)
3557	over = mp.parent->overmount;
3558	err = can_move_mount_beneath(mnt_from: old, mnt_to: over, mp: mp.mp);
3559	if (err)
3560	return err;
3561	}
3562
3563	/*
3564	* Don't move a mount tree containing unbindable mounts to a destination
3565	* mount which is shared.
3566	*/
3567	if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(mnt: old))
3568	return -EINVAL;
3569	if (!check_for_nsfs_mounts(subtree: old))
3570	return -ELOOP;
3571	if (mount_is_ancestor(p1: old, p2: mp.parent))
3572	return -ELOOP;
3573
3574	return attach_recursive_mnt(source_mnt: old, dest: &mp);
3575	}
3576
3577	static int do_move_mount_old(const struct path path, const* char *old_name)
3578	{
3579	struct path old_path __free(path_put) = {};
3580	int err;
3581
3582	if (!old_name \|\| !*old_name)
3583	return -EINVAL;
3584
3585	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
3586	if (err)
3587	return err;
3588
3589	return do_move_mount(old_path: &old_path, new_path: path, flags: `0`);
3590	}
3591
3592	/*
3593	* add a mount into a namespace's mount tree
3594	*/
3595	static int do_add_mount(struct mount newmnt, const* struct pinned_mountpoint *mp,
3596	int mnt_flags)
3597	{
3598	struct mount *parent = mp->parent;
3599
3600	if (IS_ERR(ptr: parent))
3601	return PTR_ERR(ptr: parent);
3602
3603	mnt_flags &= ~MNT_INTERNAL_FLAGS;
3604
3605	if (unlikely(!check_mnt(parent))) {
3606	/ that's acceptable only for automounts done in private ns /
3607	if (!(mnt_flags & MNT_SHRINKABLE))
3608	return -EINVAL;
3609	/ ... and for those we'd better have mountpoint still alive /
3610	if (!parent->mnt_ns)
3611	return -EINVAL;
3612	}
3613
3614	/ Refuse the same filesystem on the same mount point /
3615	if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb &&
3616	parent->mnt.mnt_root == mp->mp->m_dentry)
3617	return -EBUSY;
3618
3619	if (d_is_symlink(dentry: newmnt->mnt.mnt_root))
3620	return -EINVAL;
3621
3622	newmnt->mnt.mnt_flags = mnt_flags;
3623	return graft_tree(mnt: newmnt, mp);
3624	}
3625
3626	static bool mount_too_revealing(const struct super_block sb, int* *new_mnt_flags);
3627
3628	/*
3629	* Create a new mount using a superblock configuration and request it
3630	* be added to the namespace tree.
3631	*/
3632	static int do_new_mount_fc(struct fs_context fc, const* struct path *mountpoint,
3633	unsigned int mnt_flags)
3634	{
3635	struct super_block *sb;
3636	struct vfsmount *mnt __free(mntput) = fc_mount(fc);
3637	int error;
3638
3639	if (IS_ERR(ptr: mnt))
3640	return PTR_ERR(ptr: mnt);
3641
3642	sb = fc->root->d_sb;
3643	error = security_sb_kern_mount(sb);
3644	if (unlikely(error))
3645	return error;
3646
3647	if (unlikely(mount_too_revealing(sb, &mnt_flags))) {
3648	errorfcp(fc, "VFS", "Mount too revealing");
3649	return -EPERM;
3650	}
3651
3652	mnt_warn_timestamp_expiry(mountpoint, mnt);
3653
3654	LOCK_MOUNT(mp, mountpoint);
3655	error = do_add_mount(newmnt: real_mount(mnt), mp: &mp, mnt_flags);
3656	if (!error)
3657	retain_and_null_ptr(mnt); // consumed on success
3658	return error;
3659	}
3660
3661	/*
3662	* create a new mount for userspace and request it to be added into the
3663	* namespace's tree
3664	*/
3665	static int do_new_mount(const struct path path, const* char *fstype,
3666	int sb_flags, int mnt_flags,
3667	const char name, void* *data)
3668	{
3669	struct file_system_type *type;
3670	struct fs_context *fc;
3671	const char *subtype = NULL;
3672	int err = `0`;
3673
3674	if (!fstype)
3675	return -EINVAL;
3676
3677	type = get_fs_type(name: fstype);
3678	if (!type)
3679	return -ENODEV;
3680
3681	if (type->fs_flags & FS_HAS_SUBTYPE) {
3682	subtype = strchr(fstype, `'.'`);
3683	if (subtype) {
3684	subtype++;
3685	if (!*subtype) {
3686	put_filesystem(fs: type);
3687	return -EINVAL;
3688	}
3689	}
3690	}
3691
3692	fc = fs_context_for_mount(fs_type: type, sb_flags);
3693	put_filesystem(fs: type);
3694	if (IS_ERR(ptr: fc))
3695	return PTR_ERR(ptr: fc);
3696
3697	/*
3698	* Indicate to the filesystem that the mount request is coming
3699	* from the legacy mount system call.
3700	*/
3701	fc->oldapi = true;
3702
3703	if (subtype)
3704	err = vfs_parse_fs_string(fc, key: "subtype", value: subtype);
3705	if (!err && name)
3706	err = vfs_parse_fs_string(fc, key: "source", value: name);
3707	if (!err)
3708	err = parse_monolithic_mount_data(fc, data);
3709	if (!err && !mount_capable(fc))
3710	err = -EPERM;
3711	if (!err)
3712	err = do_new_mount_fc(fc, mountpoint: path, mnt_flags);
3713
3714	put_fs_context(fc);
3715	return err;
3716	}
3717
3718	static void lock_mount_exact(const struct path *path,
3719	struct pinned_mountpoint *mp)
3720	{
3721	struct dentry *dentry = path->dentry;
3722	int err;
3723
3724	inode_lock(inode: dentry->d_inode);
3725	namespace_lock();
3726	if (unlikely(cant_mount(dentry)))
3727	err = -ENOENT;
3728	else if (path_overmounted(path))
3729	err = -EBUSY;
3730	else
3731	err = get_mountpoint(dentry, m: mp);
3732	if (unlikely(err)) {
3733	namespace_unlock();
3734	inode_unlock(inode: dentry->d_inode);
3735	mp->parent = ERR_PTR(error: err);
3736	} else {
3737	mp->parent = real_mount(mnt: path->mnt);
3738	}
3739	}
3740
3741	int finish_automount(struct vfsmount __m, const* struct path *path)
3742	{
3743	struct vfsmount *m __free(mntput) = __m;
3744	struct mount *mnt;
3745	int err;
3746
3747	if (!m)
3748	return `0`;
3749	if (IS_ERR(ptr: m))
3750	return PTR_ERR(ptr: m);
3751
3752	mnt = real_mount(mnt: m);
3753
3754	if (m->mnt_root == path->dentry)
3755	return -ELOOP;
3756
3757	/*
3758	* we don't want to use LOCK_MOUNT() - in this case finding something
3759	* that overmounts our mountpoint to be means "quitely drop what we've
3760	* got", not "try to mount it on top".
3761	*/
3762	LOCK_MOUNT_EXACT(mp, path);
3763	if (mp.parent == ERR_PTR(error: -EBUSY))
3764	return `0`;
3765
3766	err = do_add_mount(newmnt: mnt, mp: &mp, mnt_flags: path->mnt->mnt_flags \| MNT_SHRINKABLE);
3767	if (likely(!err))
3768	retain_and_null_ptr(m);
3769	return err;
3770	}
3771
3772	/**
3773	* mnt_set_expiry - Put a mount on an expiration list
3774	* @mnt: The mount to list.
3775	* @expiry_list: The list to add the mount to.
3776	*/
3777	void mnt_set_expiry(struct vfsmount mnt, struct* list_head *expiry_list)
3778	{
3779	guard(mount_locked_reader)();
3780	list_add_tail(new: &real_mount(mnt)->mnt_expire, head: expiry_list);
3781	}
3782	EXPORT_SYMBOL(mnt_set_expiry);
3783
3784	/*
3785	* process a list of expirable mountpoints with the intent of discarding any
3786	* mountpoints that aren't in use and haven't been touched since last we came
3787	* here
3788	*/
3789	void mark_mounts_for_expiry(struct list_head *mounts)
3790	{
3791	struct mount mnt, next;
3792	LIST_HEAD(graveyard);
3793
3794	if (list_empty(head: mounts))
3795	return;
3796
3797	guard(namespace_excl)();
3798	guard(mount_writer)();
3799
3800	/ extract from the expiration list every vfsmount that matches the*
3801	* following criteria:
3802	* - already mounted
3803	* - only referenced by its parent vfsmount
3804	* - still marked for expiry (marked on the last call here; marks are
3805	* cleared by mntput())
3806	*/
3807	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3808	if (!is_mounted(mnt: &mnt->mnt))
3809	continue;
3810	if (!xchg(&mnt->mnt_expiry_mark, `1`) \|\|
3811	propagate_mount_busy(mnt, `1`))
3812	continue;
3813	list_move(list: &mnt->mnt_expire, head: &graveyard);
3814	}
3815	while (!list_empty(head: &graveyard)) {
3816	mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
3817	touch_mnt_namespace(ns: mnt->mnt_ns);
3818	umount_tree(mnt, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
3819	}
3820	}
3821
3822	EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
3823
3824	/*
3825	* Ripoff of 'select_parent()'
3826	*
3827	* search the list of submounts for a given mountpoint, and move any
3828	* shrinkable submounts to the 'graveyard' list.
3829	*/
3830	static int select_submounts(struct mount parent, struct* list_head *graveyard)
3831	{
3832	struct mount *this_parent = parent;
3833	struct list_head *next;
3834	int found = `0`;
3835
3836	repeat:
3837	next = this_parent->mnt_mounts.next;
3838	resume:
3839	while (next != &this_parent->mnt_mounts) {
3840	struct list_head *tmp = next;
3841	struct mount mnt = list_entry(tmp, struct* mount, mnt_child);
3842
3843	next = tmp->next;
3844	if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
3845	continue;
3846	/*
3847	* Descend a level if the d_mounts list is non-empty.
3848	*/
3849	if (!list_empty(head: &mnt->mnt_mounts)) {
3850	this_parent = mnt;
3851	goto repeat;
3852	}
3853
3854	if (!propagate_mount_busy(mnt, `1`)) {
3855	list_move_tail(list: &mnt->mnt_expire, head: graveyard);
3856	found++;
3857	}
3858	}
3859	/*
3860	* All done at this level ... ascend and resume the search
3861	*/
3862	if (this_parent != parent) {
3863	next = this_parent->mnt_child.next;
3864	this_parent = this_parent->mnt_parent;
3865	goto resume;
3866	}
3867	return found;
3868	}
3869
3870	/*
3871	* process a list of expirable mountpoints with the intent of discarding any
3872	* submounts of a specific parent mountpoint
3873	*
3874	* mount_lock must be held for write
3875	*/
3876	static void shrink_submounts(struct mount *mnt)
3877	{
3878	LIST_HEAD(graveyard);
3879	struct mount *m;
3880
3881	/ extract submounts of 'mountpoint' from the expiration list /
3882	while (select_submounts(parent: mnt, graveyard: &graveyard)) {
3883	while (!list_empty(head: &graveyard)) {
3884	m = list_first_entry(&graveyard, struct mount,
3885	mnt_expire);
3886	touch_mnt_namespace(ns: m->mnt_ns);
3887	umount_tree(mnt: m, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
3888	}
3889	}
3890	}
3891
3892	static void copy_mount_options(const* void __user * data)
3893	{
3894	char *copy;
3895	unsigned left, offset;
3896
3897	if (!data)
3898	return NULL;
3899
3900	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
3901	if (!copy)
3902	return ERR_PTR(error: -ENOMEM);
3903
3904	left = copy_from_user(to: copy, from: data, PAGE_SIZE);
3905
3906	/*
3907	* Not all architectures have an exact copy_from_user(). Resort to
3908	* byte at a time.
3909	*/
3910	offset = PAGE_SIZE - left;
3911	while (left) {
3912	char c;
3913	if (get_user(c, (const char __user *)data + offset))
3914	break;
3915	copy[offset] = c;
3916	left--;
3917	offset++;
3918	}
3919
3920	if (left == PAGE_SIZE) {
3921	kfree(objp: copy);
3922	return ERR_PTR(error: -EFAULT);
3923	}
3924
3925	return copy;
3926	}
3927
3928	static char copy_mount_string(const* void __user *data)
3929	{
3930	return data ? strndup_user(data, PATH_MAX) : NULL;
3931	}
3932
3933	/*
3934	* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
3935	* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
3936	*
3937	* data is a (void *) that can point to any structure up to
3938	* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
3939	* information (or be NULL).
3940	*
3941	* Pre-0.97 versions of mount() didn't have a flags word.
3942	* When the flags word was introduced its top half was required
3943	* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
3944	* Therefore, if this magic number is present, it carries no information
3945	* and must be discarded.
3946	*/
3947	int path_mount(const char dev_name, const* struct path *path,
3948	const char type_page, unsigned* long flags, void *data_page)
3949	{
3950	unsigned int mnt_flags = `0`, sb_flags;
3951	int ret;
3952
3953	/ Discard magic /
3954	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
3955	flags &= ~MS_MGC_MSK;
3956
3957	/ Basic sanity checks /
3958	if (data_page)
3959	((char *)data_page)[PAGE_SIZE - `1`] = `0`;
3960
3961	if (flags & MS_NOUSER)
3962	return -EINVAL;
3963
3964	ret = security_sb_mount(dev_name, path, type: type_page, flags, data: data_page);
3965	if (ret)
3966	return ret;
3967	if (!may_mount())
3968	return -EPERM;
3969	if (flags & SB_MANDLOCK)
3970	warn_mandlock();
3971
3972	/ Default to relatime unless overriden /
3973	if (!(flags & MS_NOATIME))
3974	mnt_flags \|= MNT_RELATIME;
3975
3976	/ Separate the per-mountpoint flags /
3977	if (flags & MS_NOSUID)
3978	mnt_flags \|= MNT_NOSUID;
3979	if (flags & MS_NODEV)
3980	mnt_flags \|= MNT_NODEV;
3981	if (flags & MS_NOEXEC)
3982	mnt_flags \|= MNT_NOEXEC;
3983	if (flags & MS_NOATIME)
3984	mnt_flags \|= MNT_NOATIME;
3985	if (flags & MS_NODIRATIME)
3986	mnt_flags \|= MNT_NODIRATIME;
3987	if (flags & MS_STRICTATIME)
3988	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
3989	if (flags & MS_RDONLY)
3990	mnt_flags \|= MNT_READONLY;
3991	if (flags & MS_NOSYMFOLLOW)
3992	mnt_flags \|= MNT_NOSYMFOLLOW;
3993
3994	/ The default atime for remount is preservation /
3995	if ((flags & MS_REMOUNT) &&
3996	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
3997	MS_STRICTATIME)) == `0`)) {
3998	mnt_flags &= ~MNT_ATIME_MASK;
3999	mnt_flags \|= path->mnt->mnt_flags & MNT_ATIME_MASK;
4000	}
4001
4002	sb_flags = flags & (SB_RDONLY \|
4003	SB_SYNCHRONOUS \|
4004	SB_MANDLOCK \|
4005	SB_DIRSYNC \|
4006	SB_SILENT \|
4007	SB_POSIXACL \|
4008	SB_LAZYTIME \|
4009	SB_I_VERSION);
4010
4011	if ((flags & (MS_REMOUNT \| MS_BIND)) == (MS_REMOUNT \| MS_BIND))
4012	return do_reconfigure_mnt(path, mnt_flags);
4013	if (flags & MS_REMOUNT)
4014	return do_remount(path, sb_flags, mnt_flags, data: data_page);
4015	if (flags & MS_BIND)
4016	return do_loopback(path, old_name: dev_name, recurse: flags & MS_REC);
4017	if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
4018	return do_change_type(path, ms_flags: flags);
4019	if (flags & MS_MOVE)
4020	return do_move_mount_old(path, old_name: dev_name);
4021
4022	return do_new_mount(path, fstype: type_page, sb_flags, mnt_flags, name: dev_name,
4023	data: data_page);
4024	}
4025
4026	int do_mount(const char dev_name, const* char __user *dir_name,
4027	const char type_page, unsigned* long flags, void *data_page)
4028	{
4029	struct path path __free(path_put) = {};
4030	int ret;
4031
4032	ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
4033	if (ret)
4034	return ret;
4035	return path_mount(dev_name, path: &path, type_page, flags, data_page);
4036	}
4037
4038	static struct ucounts inc_mnt_namespaces(struct* user_namespace *ns)
4039	{
4040	return inc_ucount(ns, current_euid(), type: UCOUNT_MNT_NAMESPACES);
4041	}
4042
4043	static void dec_mnt_namespaces(struct ucounts *ucounts)
4044	{
4045	dec_ucount(ucounts, type: UCOUNT_MNT_NAMESPACES);
4046	}
4047
4048	static void free_mnt_ns(struct mnt_namespace *ns)
4049	{
4050	if (!is_anon_ns(ns))
4051	ns_common_free(ns);
4052	dec_mnt_namespaces(ucounts: ns->ucounts);
4053	mnt_ns_tree_remove(ns);
4054	}
4055
4056	static struct mnt_namespace alloc_mnt_ns(struct* user_namespace *user_ns, bool anon)
4057	{
4058	struct mnt_namespace *new_ns;
4059	struct ucounts *ucounts;
4060	int ret;
4061
4062	ucounts = inc_mnt_namespaces(ns: user_ns);
4063	if (!ucounts)
4064	return ERR_PTR(error: -ENOSPC);
4065
4066	new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
4067	if (!new_ns) {
4068	dec_mnt_namespaces(ucounts);
4069	return ERR_PTR(error: -ENOMEM);
4070	}
4071
4072	if (anon)
4073	ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO);
4074	else
4075	ret = ns_common_init(new_ns);
4076	if (ret) {
4077	kfree(objp: new_ns);
4078	dec_mnt_namespaces(ucounts);
4079	return ERR_PTR(error: ret);
4080	}
4081	ns_tree_gen_id(new_ns);
4082
4083	new_ns->is_anon = anon;
4084	refcount_set(r: &new_ns->passive, n: `1`);
4085	new_ns->mounts = RB_ROOT;
4086	init_waitqueue_head(&new_ns->poll);
4087	new_ns->user_ns = get_user_ns(ns: user_ns);
4088	new_ns->ucounts = ucounts;
4089	return new_ns;
4090	}
4091
4092	__latent_entropy
4093	struct mnt_namespace copy_mnt_ns(u64 flags, struct* mnt_namespace *ns,
4094	struct user_namespace user_ns, struct* fs_struct *new_fs)
4095	{
4096	struct mnt_namespace *new_ns;
4097	struct vfsmount *rootmnt __free(mntput) = NULL;
4098	struct vfsmount *pwdmnt __free(mntput) = NULL;
4099	struct mount p, q;
4100	struct mount *old;
4101	struct mount *new;
4102	int copy_flags;
4103
4104	BUG_ON(!ns);
4105
4106	if (likely(!(flags & CLONE_NEWNS))) {
4107	get_mnt_ns(ns);
4108	return ns;
4109	}
4110
4111	old = ns->root;
4112
4113	new_ns = alloc_mnt_ns(user_ns, anon: false);
4114	if (IS_ERR(ptr: new_ns))
4115	return new_ns;
4116
4117	guard(namespace_excl)();
4118	/ First pass: copy the tree topology /
4119	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
4120	if (user_ns != ns->user_ns)
4121	copy_flags \|= CL_SLAVE;
4122	new = copy_tree(src_root: old, dentry: old->mnt.mnt_root, flag: copy_flags);
4123	if (IS_ERR(ptr: new)) {
4124	emptied_ns = new_ns;
4125	return ERR_CAST(ptr: new);
4126	}
4127	if (user_ns != ns->user_ns) {
4128	guard(mount_writer)();
4129	lock_mnt_tree(mnt: new);
4130	}
4131	new_ns->root = new;
4132
4133	/*
4134	* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
4135	* as belonging to new namespace. We have already acquired a private
4136	* fs_struct, so tsk->fs->lock is not needed.
4137	*/
4138	p = old;
4139	q = new;
4140	while (p) {
4141	mnt_add_to_ns(ns: new_ns, mnt: q);
4142	new_ns->nr_mounts++;
4143	if (new_fs) {
4144	if (&p->mnt == new_fs->root.mnt) {
4145	new_fs->root.mnt = mntget(&q->mnt);
4146	rootmnt = &p->mnt;
4147	}
4148	if (&p->mnt == new_fs->pwd.mnt) {
4149	new_fs->pwd.mnt = mntget(&q->mnt);
4150	pwdmnt = &p->mnt;
4151	}
4152	}
4153	p = next_mnt(p, root: old);
4154	q = next_mnt(p: q, root: new);
4155	if (!q)
4156	break;
4157	// an mntns binding we'd skipped?
4158	while (p->mnt.mnt_root != q->mnt.mnt_root)
4159	p = next_mnt(p: skip_mnt_tree(p), root: old);
4160	}
4161	ns_tree_add_raw(new_ns);
4162	return new_ns;
4163	}
4164
4165	struct dentry mount_subtree(struct* vfsmount m, const* char *name)
4166	{
4167	struct mount *mnt = real_mount(mnt: m);
4168	struct mnt_namespace *ns;
4169	struct super_block *s;
4170	struct path path;
4171	int err;
4172
4173	ns = alloc_mnt_ns(user_ns: &init_user_ns, anon: true);
4174	if (IS_ERR(ptr: ns)) {
4175	mntput(m);
4176	return ERR_CAST(ptr: ns);
4177	}
4178	ns->root = mnt;
4179	ns->nr_mounts++;
4180	mnt_add_to_ns(ns, mnt);
4181
4182	err = vfs_path_lookup(m->mnt_root, m,
4183	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
4184
4185	put_mnt_ns(ns);
4186
4187	if (err)
4188	return ERR_PTR(error: err);
4189
4190	/ trade a vfsmount reference for active sb one /
4191	s = path.mnt->mnt_sb;
4192	atomic_inc(v: &s->s_active);
4193	mntput(path.mnt);
4194	/ lock the sucker /
4195	down_write(sem: &s->s_umount);
4196	/ ... and return the root of (sub)tree on it /
4197	return path.dentry;
4198	}
4199	EXPORT_SYMBOL(mount_subtree);
4200
4201	SYSCALL_DEFINE5(mount, char __user , dev_name, char* __user *, dir_name,
4202	char __user , type, unsigned* long, flags, void __user *, data)
4203	{
4204	int ret;
4205	char *kernel_type;
4206	char *kernel_dev;
4207	void *options;
4208
4209	kernel_type = copy_mount_string(data: type);
4210	ret = PTR_ERR(ptr: kernel_type);
4211	if (IS_ERR(ptr: kernel_type))
4212	goto out_type;
4213
4214	kernel_dev = copy_mount_string(data: dev_name);
4215	ret = PTR_ERR(ptr: kernel_dev);
4216	if (IS_ERR(ptr: kernel_dev))
4217	goto out_dev;
4218
4219	options = copy_mount_options(data);
4220	ret = PTR_ERR(ptr: options);
4221	if (IS_ERR(ptr: options))
4222	goto out_data;
4223
4224	ret = do_mount(dev_name: kernel_dev, dir_name, type_page: kernel_type, flags, data_page: options);
4225
4226	kfree(objp: options);
4227	out_data:
4228	kfree(objp: kernel_dev);
4229	out_dev:
4230	kfree(objp: kernel_type);
4231	out_type:
4232	return ret;
4233	}
4234
/* Attribute bits accepted in the attr_flags argument of fsmount(). */
#define FSMOUNT_VALID_FLAGS                                                    \
	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
	 MOUNT_ATTR_NOSYMFOLLOW)

/* The fsmount() attribute bits plus MOUNT_ATTR_IDMAP. */
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)

/* Mask of the propagation type flags. */
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
4244
4245	static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
4246	{
4247	unsigned int mnt_flags = `0`;
4248
4249	if (attr_flags & MOUNT_ATTR_RDONLY)
4250	mnt_flags \|= MNT_READONLY;
4251	if (attr_flags & MOUNT_ATTR_NOSUID)
4252	mnt_flags \|= MNT_NOSUID;
4253	if (attr_flags & MOUNT_ATTR_NODEV)
4254	mnt_flags \|= MNT_NODEV;
4255	if (attr_flags & MOUNT_ATTR_NOEXEC)
4256	mnt_flags \|= MNT_NOEXEC;
4257	if (attr_flags & MOUNT_ATTR_NODIRATIME)
4258	mnt_flags \|= MNT_NODIRATIME;
4259	if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
4260	mnt_flags \|= MNT_NOSYMFOLLOW;
4261
4262	return mnt_flags;
4263	}
4264
4265	/*
4266	* Create a kernel mount representation for a new, prepared superblock
4267	* (specified by fs_fd) and attach to an open_tree-like file descriptor.
4268	*/
4269	SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
4270	unsigned int, attr_flags)
4271	{
4272	struct path new_path __free(path_put) = {};
4273	struct mnt_namespace *ns;
4274	struct fs_context *fc;
4275	struct vfsmount *new_mnt;
4276	struct mount *mnt;
4277	unsigned int mnt_flags = `0`;
4278	long ret;
4279
4280	if (!may_mount())
4281	return -EPERM;
4282
4283	if ((flags & ~(FSMOUNT_CLOEXEC)) != `0`)
4284	return -EINVAL;
4285
4286	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
4287	return -EINVAL;
4288
4289	mnt_flags = attr_flags_to_mnt_flags(attr_flags);
4290
4291	switch (attr_flags & MOUNT_ATTR__ATIME) {
4292	case MOUNT_ATTR_STRICTATIME:
4293	break;
4294	case MOUNT_ATTR_NOATIME:
4295	mnt_flags \|= MNT_NOATIME;
4296	break;
4297	case MOUNT_ATTR_RELATIME:
4298	mnt_flags \|= MNT_RELATIME;
4299	break;
4300	default:
4301	return -EINVAL;
4302	}
4303
4304	CLASS(fd, f)(fd: fs_fd);
4305	if (fd_empty(f))
4306	return -EBADF;
4307
4308	if (fd_file(f)->f_op != &fscontext_fops)
4309	return -EINVAL;
4310
4311	fc = fd_file(f)->private_data;
4312
4313	ACQUIRE(mutex_intr, uapi_mutex)(T: &fc->uapi_mutex);
4314	ret = ACQUIRE_ERR(mutex_intr, &uapi_mutex);
4315	if (ret)
4316	return ret;
4317
4318	/ There must be a valid superblock or we can't mount it /
4319	ret = -EINVAL;
4320	if (!fc->root)
4321	return ret;
4322
4323	ret = -EPERM;
4324	if (mount_too_revealing(sb: fc->root->d_sb, new_mnt_flags: &mnt_flags)) {
4325	errorfcp(fc, "VFS", "Mount too revealing");
4326	return ret;
4327	}
4328
4329	ret = -EBUSY;
4330	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
4331	return ret;
4332
4333	if (fc->sb_flags & SB_MANDLOCK)
4334	warn_mandlock();
4335
4336	new_mnt = vfs_create_mount(fc);
4337	if (IS_ERR(ptr: new_mnt))
4338	return PTR_ERR(ptr: new_mnt);
4339	new_mnt->mnt_flags = mnt_flags;
4340
4341	new_path.dentry = dget(dentry: fc->root);
4342	new_path.mnt = new_mnt;
4343
4344	/ We've done the mount bit - now move the file context into more or*
4345	* less the same state as if we'd done an fspick(). We don't want to
4346	* do any memory allocation or anything like that at this point as we
4347	* don't want to have to handle any errors incurred.
4348	*/
4349	vfs_clean_context(fc);
4350
4351	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, anon: true);
4352	if (IS_ERR(ptr: ns))
4353	return PTR_ERR(ptr: ns);
4354	mnt = real_mount(mnt: new_path.mnt);
4355	ns->root = mnt;
4356	ns->nr_mounts = `1`;
4357	mnt_add_to_ns(ns, mnt);
4358	mntget(new_path.mnt);
4359
4360	FD_PREPARE(fdf, (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : `0`,
4361	dentry_open(&new_path, O_PATH, fc->cred));
4362	if (fdf.err) {
4363	dissolve_on_fput(mnt: new_path.mnt);
4364	return fdf.err;
4365	}
4366
4367	/*
4368	* Attach to an apparent O_PATH fd with a note that we
4369	* need to unmount it, not just simply put it.
4370	*/
4371	fd_prepare_file(fdf)->f_mode \|= FMODE_NEED_UNMOUNT;
4372	return fd_publish(fdf);
4373	}
4374
4375	static inline int vfs_move_mount(const struct path *from_path,
4376	const struct path *to_path,
4377	enum mnt_tree_flags_t mflags)
4378	{
4379	int ret;
4380
4381	ret = security_move_mount(from_path, to_path);
4382	if (ret)
4383	return ret;
4384
4385	if (mflags & MNT_TREE_PROPAGATION)
4386	return do_set_group(from_path, to_path);
4387
4388	return do_move_mount(old_path: from_path, new_path: to_path, flags: mflags);
4389	}
4390
4391	/*
4392	* Move a mount from one place to another. In combination with
4393	* fsopen()/fsmount() this is used to install a new mount and in combination
4394	* with open_tree(OPEN_TREE_CLONE [\| AT_RECURSIVE]) it can be used to copy
4395	* a mount subtree.
4396	*
4397	* Note the flags value is a combination of MOVE_MOUNT_* flags.
4398	*/
4399	SYSCALL_DEFINE5(move_mount,
4400	int, from_dfd, const char __user *, from_pathname,
4401	int, to_dfd, const char __user *, to_pathname,
4402	unsigned int, flags)
4403	{
4404	struct path to_path __free(path_put) = {};
4405	struct path from_path __free(path_put) = {};
4406	struct filename *to_name __free(putname) = NULL;
4407	struct filename *from_name __free(putname) = NULL;
4408	unsigned int lflags, uflags;
4409	enum mnt_tree_flags_t mflags = `0`;
4410	int ret = `0`;
4411
4412	if (!may_mount())
4413	return -EPERM;
4414
4415	if (flags & ~MOVE_MOUNT__MASK)
4416	return -EINVAL;
4417
4418	if ((flags & (MOVE_MOUNT_BENEATH \| MOVE_MOUNT_SET_GROUP)) ==
4419	(MOVE_MOUNT_BENEATH \| MOVE_MOUNT_SET_GROUP))
4420	return -EINVAL;
4421
4422	if (flags & MOVE_MOUNT_SET_GROUP) mflags \|= MNT_TREE_PROPAGATION;
4423	if (flags & MOVE_MOUNT_BENEATH) mflags \|= MNT_TREE_BENEATH;
4424
4425	uflags = `0`;
4426	if (flags & MOVE_MOUNT_T_EMPTY_PATH)
4427	uflags = AT_EMPTY_PATH;
4428
4429	to_name = getname_maybe_null(name: to_pathname, flags: uflags);
4430	if (IS_ERR(ptr: to_name))
4431	return PTR_ERR(ptr: to_name);
4432
4433	if (!to_name && to_dfd >= `0`) {
4434	CLASS(fd_raw, f_to)(fd: to_dfd);
4435	if (fd_empty(f: f_to))
4436	return -EBADF;
4437
4438	to_path = fd_file(f_to)->f_path;
4439	path_get(&to_path);
4440	} else {
4441	lflags = `0`;
4442	if (flags & MOVE_MOUNT_T_SYMLINKS)
4443	lflags \|= LOOKUP_FOLLOW;
4444	if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
4445	lflags \|= LOOKUP_AUTOMOUNT;
4446	ret = filename_lookup(dfd: to_dfd, name: to_name, flags: lflags, path: &to_path, NULL);
4447	if (ret)
4448	return ret;
4449	}
4450
4451	uflags = `0`;
4452	if (flags & MOVE_MOUNT_F_EMPTY_PATH)
4453	uflags = AT_EMPTY_PATH;
4454
4455	from_name = getname_maybe_null(name: from_pathname, flags: uflags);
4456	if (IS_ERR(ptr: from_name))
4457	return PTR_ERR(ptr: from_name);
4458
4459	if (!from_name && from_dfd >= `0`) {
4460	CLASS(fd_raw, f_from)(fd: from_dfd);
4461	if (fd_empty(f: f_from))
4462	return -EBADF;
4463
4464	return vfs_move_mount(from_path: &fd_file(f_from)->f_path, to_path: &to_path, mflags);
4465	}
4466
4467	lflags = `0`;
4468	if (flags & MOVE_MOUNT_F_SYMLINKS)
4469	lflags \|= LOOKUP_FOLLOW;
4470	if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
4471	lflags \|= LOOKUP_AUTOMOUNT;
4472	ret = filename_lookup(dfd: from_dfd, name: from_name, flags: lflags, path: &from_path, NULL);
4473	if (ret)
4474	return ret;
4475
4476	return vfs_move_mount(from_path: &from_path, to_path: &to_path, mflags);
4477	}
4478
4479	/*
4480	* Return true if path is reachable from root
4481	*
4482	* locks: mount_locked_reader \|\| namespace_shared && is_mounted(mnt)
4483	*/
4484	bool is_path_reachable(struct mount mnt, struct* dentry *dentry,
4485	const struct path *root)
4486	{
4487	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
4488	dentry = mnt->mnt_mountpoint;
4489	mnt = mnt->mnt_parent;
4490	}
4491	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
4492	}
4493
4494	bool path_is_under(const struct path path1, const* struct path *path2)
4495	{
4496	guard(mount_locked_reader)();
4497	return is_path_reachable(mnt: real_mount(mnt: path1->mnt), dentry: path1->dentry, root: path2);
4498	}
4499	EXPORT_SYMBOL(path_is_under);
4500
4501	/*
4502	* pivot_root Semantics:
4503	* Moves the root file system of the current process to the directory put_old,
4504	* makes new_root as the new root file system of the current process, and sets
4505	* root/cwd of all processes which had them on the current root to new_root.
4506	*
4507	* Restrictions:
4508	* The new_root and put_old must be directories, and must not be on the
4509	* same file system as the current process root. The put_old must be
4510	* underneath new_root, i.e. adding a non-zero number of /.. to the string
4511	* pointed to by put_old must yield the same directory as new_root. No other
4512	* file system may be mounted on put_old. After all, new_root is a mountpoint.
4513	*
4514	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
4515	* See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
4516	* in this situation.
4517	*
4518	* Notes:
4519	* - we don't move root/cwd if they are not at the root (reason: if something
4520	* cared enough to change them, it's probably wrong to force them elsewhere)
4521	* - it's okay to pick a root that isn't the root of a file system, e.g.
4522	* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
4523	* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
4524	* first.
4525	*/
4526	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
4527	const char __user *, put_old)
4528	{
4529	struct path new __free(path_put) = {};
4530	struct path old __free(path_put) = {};
4531	struct path root __free(path_put) = {};
4532	struct mount new_mnt, root_mnt, old_mnt, root_parent, *ex_parent;
4533	int error;
4534
4535	if (!may_mount())
4536	return -EPERM;
4537
4538	error = user_path_at(AT_FDCWD, new_root,
4539	LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &new);
4540	if (error)
4541	return error;
4542
4543	error = user_path_at(AT_FDCWD, put_old,
4544	LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &old);
4545	if (error)
4546	return error;
4547
4548	error = security_sb_pivotroot(old_path: &old, new_path: &new);
4549	if (error)
4550	return error;
4551
4552	get_fs_root(current->fs, root: &root);
4553
4554	LOCK_MOUNT(old_mp, &old);
4555	old_mnt = old_mp.parent;
4556	if (IS_ERR(ptr: old_mnt))
4557	return PTR_ERR(ptr: old_mnt);
4558
4559	new_mnt = real_mount(mnt: new.mnt);
4560	root_mnt = real_mount(mnt: root.mnt);
4561	ex_parent = new_mnt->mnt_parent;
4562	root_parent = root_mnt->mnt_parent;
4563	if (IS_MNT_SHARED(old_mnt) \|\|
4564	IS_MNT_SHARED(ex_parent) \|\|
4565	IS_MNT_SHARED(root_parent))
4566	return -EINVAL;
4567	if (!check_mnt(mnt: root_mnt) \|\| !check_mnt(mnt: new_mnt))
4568	return -EINVAL;
4569	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
4570	return -EINVAL;
4571	if (d_unlinked(dentry: new.dentry))
4572	return -ENOENT;
4573	if (new_mnt == root_mnt \|\| old_mnt == root_mnt)
4574	return -EBUSY; / loop, on the same file system /
4575	if (!path_mounted(path: &root))
4576	return -EINVAL; / not a mountpoint /
4577	if (!mnt_has_parent(mnt: root_mnt))
4578	return -EINVAL; / absolute root /
4579	if (!path_mounted(path: &new))
4580	return -EINVAL; / not a mountpoint /
4581	if (!mnt_has_parent(mnt: new_mnt))
4582	return -EINVAL; / absolute root /
4583	/ make sure we can reach put_old from new_root /
4584	if (!is_path_reachable(mnt: old_mnt, dentry: old_mp.mp->m_dentry, root: &new))
4585	return -EINVAL;
4586	/ make certain new is below the root /
4587	if (!is_path_reachable(mnt: new_mnt, dentry: new.dentry, root: &root))
4588	return -EINVAL;
4589	lock_mount_hash();
4590	umount_mnt(mnt: new_mnt);
4591	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
4592	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
4593	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
4594	}
4595	/ mount new_root on / /
4596	attach_mnt(mnt: new_mnt, parent: root_parent, mp: root_mnt->mnt_mp);
4597	umount_mnt(mnt: root_mnt);
4598	/ mount old root on put_old /
4599	attach_mnt(mnt: root_mnt, parent: old_mnt, mp: old_mp.mp);
4600	touch_mnt_namespace(current->nsproxy->mnt_ns);
4601	/ A moved mount should not expire automatically /
4602	list_del_init(entry: &new_mnt->mnt_expire);
4603	unlock_mount_hash();
4604	mnt_notify_add(m: root_mnt);
4605	mnt_notify_add(m: new_mnt);
4606	chroot_fs_refs(&root, &new);
4607	return `0`;
4608	}
4609
4610	static unsigned int recalc_flags(struct mount_kattr kattr, struct* mount *mnt)
4611	{
4612	unsigned int flags = mnt->mnt.mnt_flags;
4613
4614	/ flags to clear /
4615	flags &= ~kattr->attr_clr;
4616	/ flags to raise /
4617	flags \|= kattr->attr_set;
4618
4619	return flags;
4620	}
4621
4622	static int can_idmap_mount(const struct mount_kattr kattr, struct* mount *mnt)
4623	{
4624	struct vfsmount *m = &mnt->mnt;
4625	struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
4626
4627	if (!kattr->mnt_idmap)
4628	return `0`;
4629
4630	/*
4631	* Creating an idmapped mount with the filesystem wide idmapping
4632	* doesn't make sense so block that. We don't allow mushy semantics.
4633	*/
4634	if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
4635	return -EINVAL;
4636
4637	/*
4638	* We only allow an mount to change it's idmapping if it has
4639	* never been accessible to userspace.
4640	*/
4641	if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(mnt: m))
4642	return -EPERM;
4643
4644	/ The underlying filesystem doesn't support idmapped mounts yet. /
4645	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
4646	return -EINVAL;
4647
4648	/ The filesystem has turned off idmapped mounts. /
4649	if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
4650	return -EINVAL;
4651
4652	/ We're not controlling the superblock. /
4653	if (!ns_capable(ns: fs_userns, CAP_SYS_ADMIN))
4654	return -EPERM;
4655
4656	/ Mount has already been visible in the filesystem hierarchy. /
4657	if (!is_anon_ns(ns: mnt->mnt_ns))
4658	return -EINVAL;
4659
4660	return `0`;
4661	}
4662
4663	/**
4664	* mnt_allow_writers() - check whether the attribute change allows writers
4665	* @kattr: the new mount attributes
4666	* @mnt: the mount to which @kattr will be applied
4667	*
4668	* Check whether thew new mount attributes in @kattr allow concurrent writers.
4669	*
4670	* Return: true if writers need to be held, false if not
4671	*/
4672	static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
4673	const struct mount *mnt)
4674	{
4675	return (!(kattr->attr_set & MNT_READONLY) \|\|
4676	(mnt->mnt.mnt_flags & MNT_READONLY)) &&
4677	!kattr->mnt_idmap;
4678	}
4679
4680	static int mount_setattr_prepare(struct mount_kattr kattr, struct* mount *mnt)
4681	{
4682	struct mount *m;
4683	int err;
4684
4685	for (m = mnt; m; m = next_mnt(p: m, root: mnt)) {
4686	if (!can_change_locked_flags(mnt: m, mnt_flags: recalc_flags(kattr, mnt: m))) {
4687	err = -EPERM;
4688	break;
4689	}
4690
4691	err = can_idmap_mount(kattr, mnt: m);
4692	if (err)
4693	break;
4694
4695	if (!mnt_allow_writers(kattr, mnt: m)) {
4696	err = mnt_hold_writers(mnt: m);
4697	if (err) {
4698	m = next_mnt(p: m, root: mnt);
4699	break;
4700	}
4701	}
4702
4703	if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
4704	return `0`;
4705	}
4706
4707	if (err) {
4708	/ undo all mnt_hold_writers() we'd done /
4709	for (struct mount *p = mnt; p != m; p = next_mnt(p, root: mnt))
4710	mnt_unhold_writers(mnt: p);
4711	}
4712	return err;
4713	}
4714
4715	static void do_idmap_mount(const struct mount_kattr kattr, struct* mount *mnt)
4716	{
4717	struct mnt_idmap *old_idmap;
4718
4719	if (!kattr->mnt_idmap)
4720	return;
4721
4722	old_idmap = mnt_idmap(mnt: &mnt->mnt);
4723
4724	/ Pairs with smp_load_acquire() in mnt_idmap(). /
4725	smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
4726	mnt_idmap_put(idmap: old_idmap);
4727	}
4728
4729	static void mount_setattr_commit(struct mount_kattr kattr, struct* mount *mnt)
4730	{
4731	struct mount *m;
4732
4733	for (m = mnt; m; m = next_mnt(p: m, root: mnt)) {
4734	unsigned int flags;
4735
4736	do_idmap_mount(kattr, mnt: m);
4737	flags = recalc_flags(kattr, mnt: m);
4738	WRITE_ONCE(m->mnt.mnt_flags, flags);
4739
4740	/ If we had to hold writers unblock them. /
4741	mnt_unhold_writers(mnt: m);
4742
4743	if (kattr->propagation)
4744	change_mnt_propagation(m, kattr->propagation);
4745	if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
4746	break;
4747	}
4748	touch_mnt_namespace(ns: mnt->mnt_ns);
4749	}
4750
4751	static int do_mount_setattr(const struct path path, struct* mount_kattr *kattr)
4752	{
4753	struct mount *mnt = real_mount(mnt: path->mnt);
4754	int err = `0`;
4755
4756	if (!path_mounted(path))
4757	return -EINVAL;
4758
4759	if (kattr->mnt_userns) {
4760	struct mnt_idmap *mnt_idmap;
4761
4762	mnt_idmap = alloc_mnt_idmap(mnt_userns: kattr->mnt_userns);
4763	if (IS_ERR(ptr: mnt_idmap))
4764	return PTR_ERR(ptr: mnt_idmap);
4765	kattr->mnt_idmap = mnt_idmap;
4766	}
4767
4768	if (kattr->propagation) {
4769	/*
4770	* Only take namespace_lock() if we're actually changing
4771	* propagation.
4772	*/
4773	namespace_lock();
4774	if (kattr->propagation == MS_SHARED) {
4775	err = invent_group_ids(mnt, recurse: kattr->kflags & MOUNT_KATTR_RECURSE);
4776	if (err) {
4777	namespace_unlock();
4778	return err;
4779	}
4780	}
4781	}
4782
4783	err = -EINVAL;
4784	lock_mount_hash();
4785
4786	if (!anon_ns_root(m: mnt) && !check_mnt(mnt))
4787	goto out;
4788
4789	/*
4790	* First, we get the mount tree in a shape where we can change mount
4791	* properties without failure. If we succeeded to do so we commit all
4792	* changes and if we failed we clean up.
4793	*/
4794	err = mount_setattr_prepare(kattr, mnt);
4795	if (!err)
4796	mount_setattr_commit(kattr, mnt);
4797
4798	out:
4799	unlock_mount_hash();
4800
4801	if (kattr->propagation) {
4802	if (err)
4803	cleanup_group_ids(mnt, NULL);
4804	namespace_unlock();
4805	}
4806
4807	return err;
4808	}
4809
4810	static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
4811	struct mount_kattr *kattr)
4812	{
4813	struct ns_common *ns;
4814	struct user_namespace *mnt_userns;
4815
4816	if (!((attr->attr_set \| attr->attr_clr) & MOUNT_ATTR_IDMAP))
4817	return `0`;
4818
4819	if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
4820	/*
4821	* We can only remove an idmapping if it's never been
4822	* exposed to userspace.
4823	*/
4824	if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
4825	return -EINVAL;
4826
4827	/*
4828	* Removal of idmappings is equivalent to setting
4829	* nop_mnt_idmap.
4830	*/
4831	if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
4832	kattr->mnt_idmap = &nop_mnt_idmap;
4833	return `0`;
4834	}
4835	}
4836
4837	if (attr->userns_fd > INT_MAX)
4838	return -EINVAL;
4839
4840	CLASS(fd, f)(fd: attr->userns_fd);
4841	if (fd_empty(f))
4842	return -EBADF;
4843
4844	if (!proc_ns_file(fd_file(f)))
4845	return -EINVAL;
4846
4847	ns = get_proc_ns(file_inode(fd_file(f)));
4848	if (ns->ns_type != CLONE_NEWUSER)
4849	return -EINVAL;
4850
4851	/*
4852	* The initial idmapping cannot be used to create an idmapped
4853	* mount. We use the initial idmapping as an indicator of a mount
4854	* that is not idmapped. It can simply be passed into helpers that
4855	* are aware of idmapped mounts as a convenient shortcut. A user
4856	* can just create a dedicated identity mapping to achieve the same
4857	* result.
4858	*/
4859	mnt_userns = container_of(ns, struct user_namespace, ns);
4860	if (mnt_userns == &init_user_ns)
4861	return -EPERM;
4862
4863	/ We're not controlling the target namespace. /
4864	if (!ns_capable(ns: mnt_userns, CAP_SYS_ADMIN))
4865	return -EPERM;
4866
4867	kattr->mnt_userns = get_user_ns(ns: mnt_userns);
4868	return `0`;
4869	}
4870
4871	static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
4872	struct mount_kattr *kattr)
4873	{
4874	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
4875	return -EINVAL;
4876	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > `1`)
4877	return -EINVAL;
4878	kattr->propagation = attr->propagation;
4879
4880	if ((attr->attr_set \| attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
4881	return -EINVAL;
4882
4883	kattr->attr_set = attr_flags_to_mnt_flags(attr_flags: attr->attr_set);
4884	kattr->attr_clr = attr_flags_to_mnt_flags(attr_flags: attr->attr_clr);
4885
4886	/*
4887	* Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
4888	* users wanting to transition to a different atime setting cannot
4889	* simply specify the atime setting in @attr_set, but must also
4890	* specify MOUNT_ATTR__ATIME in the @attr_clr field.
4891	* So ensure that MOUNT_ATTR__ATIME can't be partially set in
4892	* @attr_clr and that @attr_set can't have any atime bits set if
4893	* MOUNT_ATTR__ATIME isn't set in @attr_clr.
4894	*/
4895	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
4896	if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
4897	return -EINVAL;
4898
4899	/*
4900	* Clear all previous time settings as they are mutually
4901	* exclusive.
4902	*/
4903	kattr->attr_clr \|= MNT_RELATIME \| MNT_NOATIME;
4904	switch (attr->attr_set & MOUNT_ATTR__ATIME) {
4905	case MOUNT_ATTR_RELATIME:
4906	kattr->attr_set \|= MNT_RELATIME;
4907	break;
4908	case MOUNT_ATTR_NOATIME:
4909	kattr->attr_set \|= MNT_NOATIME;
4910	break;
4911	case MOUNT_ATTR_STRICTATIME:
4912	break;
4913	default:
4914	return -EINVAL;
4915	}
4916	} else {
4917	if (attr->attr_set & MOUNT_ATTR__ATIME)
4918	return -EINVAL;
4919	}
4920
4921	return build_mount_idmapped(attr, usize, kattr);
4922	}
4923
4924	static void finish_mount_kattr(struct mount_kattr *kattr)
4925	{
4926	if (kattr->mnt_userns) {
4927	put_user_ns(ns: kattr->mnt_userns);
4928	kattr->mnt_userns = NULL;
4929	}
4930
4931	if (kattr->mnt_idmap)
4932	mnt_idmap_put(idmap: kattr->mnt_idmap);
4933	}
4934
4935	static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
4936	struct mount_kattr *kattr)
4937	{
4938	int ret;
4939	struct mount_attr attr;
4940
4941	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
4942
4943	if (unlikely(usize > PAGE_SIZE))
4944	return -E2BIG;
4945	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
4946	return -EINVAL;
4947
4948	if (!may_mount())
4949	return -EPERM;
4950
4951	ret = copy_struct_from_user(dst: &attr, ksize: sizeof(attr), src: uattr, usize);
4952	if (ret)
4953	return ret;
4954
4955	/ Don't bother walking through the mounts if this is a nop. /
4956	if (attr.attr_set == `0` &&
4957	attr.attr_clr == `0` &&
4958	attr.propagation == `0`)
4959	return `0`; / Tell caller to not bother. /
4960
4961	ret = build_mount_kattr(attr: &attr, usize, kattr);
4962	if (ret < `0`)
4963	return ret;
4964
4965	return `1`;
4966	}
4967
4968	SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
4969	unsigned int, flags, struct mount_attr __user *, uattr,
4970	size_t, usize)
4971	{
4972	int err;
4973	struct path target;
4974	struct mount_kattr kattr;
4975	unsigned int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
4976
4977	if (flags & ~(AT_EMPTY_PATH \|
4978	AT_RECURSIVE \|
4979	AT_SYMLINK_NOFOLLOW \|
4980	AT_NO_AUTOMOUNT))
4981	return -EINVAL;
4982
4983	if (flags & AT_NO_AUTOMOUNT)
4984	lookup_flags &= ~LOOKUP_AUTOMOUNT;
4985	if (flags & AT_SYMLINK_NOFOLLOW)
4986	lookup_flags &= ~LOOKUP_FOLLOW;
4987	if (flags & AT_EMPTY_PATH)
4988	lookup_flags \|= LOOKUP_EMPTY;
4989
4990	kattr = (struct mount_kattr) {
4991	.lookup_flags = lookup_flags,
4992	};
4993
4994	if (flags & AT_RECURSIVE)
4995	kattr.kflags \|= MOUNT_KATTR_RECURSE;
4996
4997	err = wants_mount_setattr(uattr, usize, kattr: &kattr);
4998	if (err <= `0`)
4999	return err;
5000
5001	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
5002	if (!err) {
5003	err = do_mount_setattr(path: &target, kattr: &kattr);
5004	path_put(&target);
5005	}
5006	finish_mount_kattr(kattr: &kattr);
5007	return err;
5008	}
5009
5010	SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
5011	unsigned, flags, struct mount_attr __user *, uattr,
5012	size_t, usize)
5013	{
5014	if (!uattr && usize)
5015	return -EINVAL;
5016
5017	FD_PREPARE(fdf, flags, vfs_open_tree(dfd, filename, flags));
5018	if (fdf.err)
5019	return fdf.err;
5020
5021	if (uattr) {
5022	struct mount_kattr kattr = {};
5023	struct file *file = fd_prepare_file(fdf);
5024	int ret;
5025
5026	if (flags & OPEN_TREE_CLONE)
5027	kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
5028	if (flags & AT_RECURSIVE)
5029	kattr.kflags \|= MOUNT_KATTR_RECURSE;
5030
5031	ret = wants_mount_setattr(uattr, usize, kattr: &kattr);
5032	if (ret > `0`) {
5033	ret = do_mount_setattr(path: &file->f_path, kattr: &kattr);
5034	finish_mount_kattr(kattr: &kattr);
5035	}
5036	if (ret)
5037	return ret;
5038	}
5039
5040	return fd_publish(fdf);
5041	}
5042
5043	int show_path(struct seq_file m, struct* dentry *root)
5044	{
5045	if (root->d_sb->s_op->show_path)
5046	return root->d_sb->s_op->show_path(m, root);
5047
5048	seq_dentry(m, root, " \t\n\\");
5049	return `0`;
5050	}
5051
5052	static struct vfsmount lookup_mnt_in_ns(u64 id, struct* mnt_namespace *ns)
5053	{
5054	struct mount *mnt = mnt_find_id_at(ns, mnt_id: id);
5055
5056	if (!mnt \|\| mnt->mnt_id_unique != id)
5057	return NULL;
5058
5059	return &mnt->mnt;
5060	}
5061
5062	struct kstatmount {
5063	struct statmount __user *buf;
5064	size_t bufsize;
5065	struct vfsmount *mnt;
5066	struct mnt_idmap *idmap;
5067	u64 mask;
5068	struct path root;
5069	struct seq_file seq;
5070
5071	/ Must be last --ends in a flexible-array member. /
5072	struct statmount sm;
5073	};
5074
5075	static u64 mnt_to_attr_flags(struct vfsmount *mnt)
5076	{
5077	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
5078	u64 attr_flags = `0`;
5079
5080	if (mnt_flags & MNT_READONLY)
5081	attr_flags \|= MOUNT_ATTR_RDONLY;
5082	if (mnt_flags & MNT_NOSUID)
5083	attr_flags \|= MOUNT_ATTR_NOSUID;
5084	if (mnt_flags & MNT_NODEV)
5085	attr_flags \|= MOUNT_ATTR_NODEV;
5086	if (mnt_flags & MNT_NOEXEC)
5087	attr_flags \|= MOUNT_ATTR_NOEXEC;
5088	if (mnt_flags & MNT_NODIRATIME)
5089	attr_flags \|= MOUNT_ATTR_NODIRATIME;
5090	if (mnt_flags & MNT_NOSYMFOLLOW)
5091	attr_flags \|= MOUNT_ATTR_NOSYMFOLLOW;
5092
5093	if (mnt_flags & MNT_NOATIME)
5094	attr_flags \|= MOUNT_ATTR_NOATIME;
5095	else if (mnt_flags & MNT_RELATIME)
5096	attr_flags \|= MOUNT_ATTR_RELATIME;
5097	else
5098	attr_flags \|= MOUNT_ATTR_STRICTATIME;
5099
5100	if (is_idmapped_mnt(mnt))
5101	attr_flags \|= MOUNT_ATTR_IDMAP;
5102
5103	return attr_flags;
5104	}
5105
5106	static u64 mnt_to_propagation_flags(struct mount *m)
5107	{
5108	u64 propagation = `0`;
5109
5110	if (IS_MNT_SHARED(m))
5111	propagation \|= MS_SHARED;
5112	if (IS_MNT_SLAVE(m))
5113	propagation \|= MS_SLAVE;
5114	if (IS_MNT_UNBINDABLE(m))
5115	propagation \|= MS_UNBINDABLE;
5116	if (!propagation)
5117	propagation \|= MS_PRIVATE;
5118
5119	return propagation;
5120	}
5121
5122	u64 vfsmount_to_propagation_flags(struct vfsmount *mnt)
5123	{
5124	return mnt_to_propagation_flags(m: real_mount(mnt));
5125	}
5126	EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags);
5127
5128	static void statmount_sb_basic(struct kstatmount *s)
5129	{
5130	struct super_block *sb = s->mnt->mnt_sb;
5131
5132	s->sm.mask \|= STATMOUNT_SB_BASIC;
5133	s->sm.sb_dev_major = MAJOR(sb->s_dev);
5134	s->sm.sb_dev_minor = MINOR(sb->s_dev);
5135	s->sm.sb_magic = sb->s_magic;
5136	s->sm.sb_flags = sb->s_flags & (SB_RDONLY\|SB_SYNCHRONOUS\|SB_DIRSYNC\|SB_LAZYTIME);
5137	}
5138
5139	static void statmount_mnt_basic(struct kstatmount *s)
5140	{
5141	struct mount *m = real_mount(mnt: s->mnt);
5142
5143	s->sm.mask \|= STATMOUNT_MNT_BASIC;
5144	s->sm.mnt_id = m->mnt_id_unique;
5145	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
5146	s->sm.mnt_id_old = m->mnt_id;
5147	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
5148	s->sm.mnt_attr = mnt_to_attr_flags(mnt: &m->mnt);
5149	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
5150	s->sm.mnt_peer_group = m->mnt_group_id;
5151	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : `0`;
5152	}
5153
5154	static void statmount_propagate_from(struct kstatmount *s)
5155	{
5156	struct mount *m = real_mount(mnt: s->mnt);
5157
5158	s->sm.mask \|= STATMOUNT_PROPAGATE_FROM;
5159	if (IS_MNT_SLAVE(m))
5160	s->sm.propagate_from = get_dominating_id(mnt: m, root: &current->fs->root);
5161	}
5162
5163	static int statmount_mnt_root(struct kstatmount s, struct* seq_file *seq)
5164	{
5165	int ret;
5166	size_t start = seq->count;
5167
5168	ret = show_path(m: seq, root: s->mnt->mnt_root);
5169	if (ret)
5170	return ret;
5171
5172	if (unlikely(seq_has_overflowed(seq)))
5173	return -EAGAIN;
5174
5175	/*
5176	* Unescape the result. It would be better if supplied string was not
5177	* escaped in the first place, but that's a pretty invasive change.
5178	*/
5179	seq->buf[seq->count] = `'\0'`;
5180	seq->count = start;
5181	seq_commit(m: seq, num: string_unescape_inplace(buf: seq->buf + start, UNESCAPE_OCTAL));
5182	return `0`;
5183	}
5184
5185	static int statmount_mnt_point(struct kstatmount s, struct* seq_file *seq)
5186	{
5187	struct vfsmount *mnt = s->mnt;
5188	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
5189	int err;
5190
5191	err = seq_path_root(m: seq, path: &mnt_path, root: &s->root, esc: "");
5192	return err == SEQ_SKIP ? `0` : err;
5193	}
5194
5195	static int statmount_fs_type(struct kstatmount s, struct* seq_file *seq)
5196	{
5197	struct super_block *sb = s->mnt->mnt_sb;
5198
5199	seq_puts(m: seq, s: sb->s_type->name);
5200	return `0`;
5201	}
5202
5203	static void statmount_fs_subtype(struct kstatmount s, struct* seq_file *seq)
5204	{
5205	struct super_block *sb = s->mnt->mnt_sb;
5206
5207	if (sb->s_subtype)
5208	seq_puts(m: seq, s: sb->s_subtype);
5209	}
5210
5211	static int statmount_sb_source(struct kstatmount s, struct* seq_file *seq)
5212	{
5213	struct super_block *sb = s->mnt->mnt_sb;
5214	struct mount *r = real_mount(mnt: s->mnt);
5215
5216	if (sb->s_op->show_devname) {
5217	size_t start = seq->count;
5218	int ret;
5219
5220	ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
5221	if (ret)
5222	return ret;
5223
5224	if (unlikely(seq_has_overflowed(seq)))
5225	return -EAGAIN;
5226
5227	/ Unescape the result /
5228	seq->buf[seq->count] = `'\0'`;
5229	seq->count = start;
5230	seq_commit(m: seq, num: string_unescape_inplace(buf: seq->buf + start, UNESCAPE_OCTAL));
5231	} else {
5232	seq_puts(m: seq, s: r->mnt_devname);
5233	}
5234	return `0`;
5235	}
5236
5237	static void statmount_mnt_ns_id(struct kstatmount s, struct* mnt_namespace *ns)
5238	{
5239	s->sm.mask \|= STATMOUNT_MNT_NS_ID;
5240	s->sm.mnt_ns_id = ns->ns.ns_id;
5241	}
5242
5243	static int statmount_mnt_opts(struct kstatmount s, struct* seq_file *seq)
5244	{
5245	struct vfsmount *mnt = s->mnt;
5246	struct super_block *sb = mnt->mnt_sb;
5247	size_t start = seq->count;
5248	int err;
5249
5250	err = security_sb_show_options(m: seq, sb);
5251	if (err)
5252	return err;
5253
5254	if (sb->s_op->show_options) {
5255	err = sb->s_op->show_options(seq, mnt->mnt_root);
5256	if (err)
5257	return err;
5258	}
5259
5260	if (unlikely(seq_has_overflowed(seq)))
5261	return -EAGAIN;
5262
5263	if (seq->count == start)
5264	return `0`;
5265
5266	/ skip leading comma /
5267	memmove(seq->buf + start, seq->buf + start + `1`,
5268	seq->count - start - `1`);
5269	seq->count--;
5270
5271	return `0`;
5272	}
5273
5274	static inline int statmount_opt_process(struct seq_file *seq, size_t start)
5275	{
5276	char buf_end, opt_end, src, dst;
5277	int count = `0`;
5278
5279	if (unlikely(seq_has_overflowed(seq)))
5280	return -EAGAIN;
5281
5282	buf_end = seq->buf + seq->count;
5283	dst = seq->buf + start;
5284	src = dst + `1`; / skip initial comma /
5285
5286	if (src >= buf_end) {
5287	seq->count = start;
5288	return `0`;
5289	}
5290
5291	*buf_end = `'\0'`;
5292	for (; src < buf_end; src = opt_end + `1`) {
5293	opt_end = strchrnul(src, `','`);
5294	*opt_end = `'\0'`;
5295	dst += string_unescape(src, dst, size: `0`, UNESCAPE_OCTAL) + `1`;
5296	if (WARN_ON_ONCE(++count == INT_MAX))
5297	return -EOVERFLOW;
5298	}
5299	seq->count = dst - `1` - seq->buf;
5300	return count;
5301	}
5302
5303	static int statmount_opt_array(struct kstatmount s, struct* seq_file *seq)
5304	{
5305	struct vfsmount *mnt = s->mnt;
5306	struct super_block *sb = mnt->mnt_sb;
5307	size_t start = seq->count;
5308	int err;
5309
5310	if (!sb->s_op->show_options)
5311	return `0`;
5312
5313	err = sb->s_op->show_options(seq, mnt->mnt_root);
5314	if (err)
5315	return err;
5316
5317	err = statmount_opt_process(seq, start);
5318	if (err < `0`)
5319	return err;
5320
5321	s->sm.opt_num = err;
5322	return `0`;
5323	}
5324
5325	static int statmount_opt_sec_array(struct kstatmount s, struct* seq_file *seq)
5326	{
5327	struct vfsmount *mnt = s->mnt;
5328	struct super_block *sb = mnt->mnt_sb;
5329	size_t start = seq->count;
5330	int err;
5331
5332	err = security_sb_show_options(m: seq, sb);
5333	if (err)
5334	return err;
5335
5336	err = statmount_opt_process(seq, start);
5337	if (err < `0`)
5338	return err;
5339
5340	s->sm.opt_sec_num = err;
5341	return `0`;
5342	}
5343
5344	static inline int statmount_mnt_uidmap(struct kstatmount s, struct* seq_file *seq)
5345	{
5346	int ret;
5347
5348	ret = statmount_mnt_idmap(idmap: s->idmap, seq, uid_map: true);
5349	if (ret < `0`)
5350	return ret;
5351
5352	s->sm.mnt_uidmap_num = ret;
5353	/*
5354	* Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
5355	* mappings. This allows userspace to distinguish between a
5356	* non-idmapped mount and an idmapped mount where none of the
5357	* individual mappings are valid in the caller's idmapping.
5358	*/
5359	if (is_valid_mnt_idmap(idmap: s->idmap))
5360	s->sm.mask \|= STATMOUNT_MNT_UIDMAP;
5361	return `0`;
5362	}
5363
5364	static inline int statmount_mnt_gidmap(struct kstatmount s, struct* seq_file *seq)
5365	{
5366	int ret;
5367
5368	ret = statmount_mnt_idmap(idmap: s->idmap, seq, uid_map: false);
5369	if (ret < `0`)
5370	return ret;
5371
5372	s->sm.mnt_gidmap_num = ret;
5373	/*
5374	* Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
5375	* mappings. This allows userspace to distinguish between a
5376	* non-idmapped mount and an idmapped mount where none of the
5377	* individual mappings are valid in the caller's idmapping.
5378	*/
5379	if (is_valid_mnt_idmap(idmap: s->idmap))
5380	s->sm.mask \|= STATMOUNT_MNT_GIDMAP;
5381	return `0`;
5382	}
5383
5384	static int statmount_string(struct kstatmount *s, u64 flag)
5385	{
5386	int ret = `0`;
5387	size_t kbufsize;
5388	struct seq_file *seq = &s->seq;
5389	struct statmount *sm = &s->sm;
5390	u32 start, *offp;
5391
5392	/ Reserve an empty string at the beginning for any unset offsets /
5393	if (!seq->count)
5394	seq_putc(m: seq, c: `0`);
5395
5396	start = seq->count;
5397
5398	switch (flag) {
5399	case STATMOUNT_FS_TYPE:
5400	offp = &sm->fs_type;
5401	ret = statmount_fs_type(s, seq);
5402	break;
5403	case STATMOUNT_MNT_ROOT:
5404	offp = &sm->mnt_root;
5405	ret = statmount_mnt_root(s, seq);
5406	break;
5407	case STATMOUNT_MNT_POINT:
5408	offp = &sm->mnt_point;
5409	ret = statmount_mnt_point(s, seq);
5410	break;
5411	case STATMOUNT_MNT_OPTS:
5412	offp = &sm->mnt_opts;
5413	ret = statmount_mnt_opts(s, seq);
5414	break;
5415	case STATMOUNT_OPT_ARRAY:
5416	offp = &sm->opt_array;
5417	ret = statmount_opt_array(s, seq);
5418	break;
5419	case STATMOUNT_OPT_SEC_ARRAY:
5420	offp = &sm->opt_sec_array;
5421	ret = statmount_opt_sec_array(s, seq);
5422	break;
5423	case STATMOUNT_FS_SUBTYPE:
5424	offp = &sm->fs_subtype;
5425	statmount_fs_subtype(s, seq);
5426	break;
5427	case STATMOUNT_SB_SOURCE:
5428	offp = &sm->sb_source;
5429	ret = statmount_sb_source(s, seq);
5430	break;
5431	case STATMOUNT_MNT_UIDMAP:
5432	offp = &sm->mnt_uidmap;
5433	ret = statmount_mnt_uidmap(s, seq);
5434	break;
5435	case STATMOUNT_MNT_GIDMAP:
5436	offp = &sm->mnt_gidmap;
5437	ret = statmount_mnt_gidmap(s, seq);
5438	break;
5439	default:
5440	WARN_ON_ONCE(true);
5441	return -EINVAL;
5442	}
5443
5444	/*
5445	* If nothing was emitted, return to avoid setting the flag
5446	* and terminating the buffer.
5447	*/
5448	if (seq->count == start)
5449	return ret;
5450	if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
5451	return -EOVERFLOW;
5452	if (kbufsize >= s->bufsize)
5453	return -EOVERFLOW;
5454
5455	/ signal a retry /
5456	if (unlikely(seq_has_overflowed(seq)))
5457	return -EAGAIN;
5458
5459	if (ret)
5460	return ret;
5461
5462	seq->buf[seq->count++] = `'\0'`;
5463	sm->mask \|= flag;
5464	*offp = start;
5465	return `0`;
5466	}
5467
5468	static int copy_statmount_to_user(struct kstatmount *s)
5469	{
5470	struct statmount *sm = &s->sm;
5471	struct seq_file *seq = &s->seq;
5472	char __user str = ((char* __user )s->buf) + sizeof(sm);
5473	size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
5474
5475	if (seq->count && copy_to_user(to: str, from: seq->buf, n: seq->count))
5476	return -EFAULT;
5477
5478	/ Return the number of bytes copied to the buffer /
5479	sm->size = copysize + seq->count;
5480	if (copy_to_user(to: s->buf, from: sm, n: copysize))
5481	return -EFAULT;
5482
5483	return `0`;
5484	}
5485
5486	static struct mount listmnt_next(struct* mount *curr, bool reverse)
5487	{
5488	struct rb_node *node;
5489
5490	if (reverse)
5491	node = rb_prev(&curr->mnt_node);
5492	else
5493	node = rb_next(&curr->mnt_node);
5494
5495	return node_to_mount(node);
5496	}
5497
5498	static int grab_requested_root(struct mnt_namespace ns, struct* path *root)
5499	{
5500	struct mount first, child;
5501
5502	rwsem_assert_held(sem: &namespace_sem);
5503
5504	/ We're looking at our own ns, just use get_fs_root. /
5505	if (ns == current->nsproxy->mnt_ns) {
5506	get_fs_root(current->fs, root);
5507	return `0`;
5508	}
5509
5510	/*
5511	* We have to find the first mount in our ns and use that, however it
5512	* may not exist, so handle that properly.
5513	*/
5514	if (mnt_ns_empty(ns))
5515	return -ENOENT;
5516
5517	first = child = ns->root;
5518	for (;;) {
5519	child = listmnt_next(curr: child, reverse: false);
5520	if (!child)
5521	return -ENOENT;
5522	if (child->mnt_parent == first)
5523	break;
5524	}
5525
5526	root->mnt = mntget(&child->mnt);
5527	root->dentry = dget(dentry: root->mnt->mnt_root);
5528	return `0`;
5529	}
5530
/*
 * Mask reported in statmount::supported_mask when
 * STATMOUNT_SUPPORTED_MASK is requested.
 *
 * This must be updated whenever a new flag is added
 */
#define STATMOUNT_SUPPORTED \
	(STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC | \
	 STATMOUNT_PROPAGATE_FROM | STATMOUNT_MNT_ROOT | \
	 STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE | \
	 STATMOUNT_MNT_NS_ID | STATMOUNT_MNT_OPTS | \
	 STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
	 STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
	 STATMOUNT_SUPPORTED_MASK | STATMOUNT_MNT_UIDMAP | \
	 STATMOUNT_MNT_GIDMAP)
5547
5548	/ locks: namespace_shared /
5549	static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
5550	struct mnt_namespace *ns)
5551	{
5552	struct mount *m;
5553	int err;
5554
5555	/ Has the namespace already been emptied? /
5556	if (mnt_ns_id && mnt_ns_empty(ns))
5557	return -ENOENT;
5558
5559	s->mnt = lookup_mnt_in_ns(id: mnt_id, ns);
5560	if (!s->mnt)
5561	return -ENOENT;
5562
5563	err = grab_requested_root(ns, root: &s->root);
5564	if (err)
5565	return err;
5566
5567	/*
5568	* Don't trigger audit denials. We just want to determine what
5569	* mounts to show users.
5570	*/
5571	m = real_mount(mnt: s->mnt);
5572	if (!is_path_reachable(mnt: m, dentry: m->mnt.mnt_root, root: &s->root) &&
5573	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
5574	return -EPERM;
5575
5576	err = security_sb_statfs(dentry: s->mnt->mnt_root);
5577	if (err)
5578	return err;
5579
5580	/*
5581	* Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
5582	* can change concurrently as we only hold the read-side of the
5583	* namespace semaphore and mount properties may change with only
5584	* the mount lock held.
5585	*
5586	* We could sample the mount lock sequence counter to detect
5587	* those changes and retry. But it's not worth it. Worst that
5588	* happens is that the mnt->mnt_idmap pointer is already changed
5589	* while mnt->mnt_flags isn't or vica versa. So what.
5590	*
5591	* Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
5592	* via READ_ONCE()/WRITE_ONCE() and guard against theoretical
5593	* torn read/write. That's all we care about right now.
5594	*/
5595	s->idmap = mnt_idmap(mnt: s->mnt);
5596	if (s->mask & STATMOUNT_MNT_BASIC)
5597	statmount_mnt_basic(s);
5598
5599	if (s->mask & STATMOUNT_SB_BASIC)
5600	statmount_sb_basic(s);
5601
5602	if (s->mask & STATMOUNT_PROPAGATE_FROM)
5603	statmount_propagate_from(s);
5604
5605	if (s->mask & STATMOUNT_FS_TYPE)
5606	err = statmount_string(s, STATMOUNT_FS_TYPE);
5607
5608	if (!err && s->mask & STATMOUNT_MNT_ROOT)
5609	err = statmount_string(s, STATMOUNT_MNT_ROOT);
5610
5611	if (!err && s->mask & STATMOUNT_MNT_POINT)
5612	err = statmount_string(s, STATMOUNT_MNT_POINT);
5613
5614	if (!err && s->mask & STATMOUNT_MNT_OPTS)
5615	err = statmount_string(s, STATMOUNT_MNT_OPTS);
5616
5617	if (!err && s->mask & STATMOUNT_OPT_ARRAY)
5618	err = statmount_string(s, STATMOUNT_OPT_ARRAY);
5619
5620	if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
5621	err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
5622
5623	if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
5624	err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
5625
5626	if (!err && s->mask & STATMOUNT_SB_SOURCE)
5627	err = statmount_string(s, STATMOUNT_SB_SOURCE);
5628
5629	if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
5630	err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
5631
5632	if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
5633	err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
5634
5635	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
5636	statmount_mnt_ns_id(s, ns);
5637
5638	if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
5639	s->sm.mask \|= STATMOUNT_SUPPORTED_MASK;
5640	s->sm.supported_mask = STATMOUNT_SUPPORTED;
5641	}
5642
5643	if (err)
5644	return err;
5645
5646	/ Are there bits in the return mask not present in STATMOUNT_SUPPORTED? /
5647	WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
5648
5649	return `0`;
5650	}
5651
5652	static inline bool retry_statmount(const long ret, size_t *seq_size)
5653	{
5654	if (likely(ret != -EAGAIN))
5655	return false;
5656	if (unlikely(check_mul_overflow(*seq_size, `2`, seq_size)))
5657	return false;
5658	if (unlikely(*seq_size > MAX_RW_COUNT))
5659	return false;
5660	return true;
5661	}
5662
/* Request bits for which prepare_kstatmount() allocates a seq buffer. */
#define STATMOUNT_STRING_REQ \
	(STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
	 STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
	 STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
	 STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
	 STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
5668
5669	static int prepare_kstatmount(struct kstatmount ks, struct* mnt_id_req *kreq,
5670	struct statmount __user *buf, size_t bufsize,
5671	size_t seq_size)
5672	{
5673	if (!access_ok(buf, bufsize))
5674	return -EFAULT;
5675
5676	memset(ks, `0`, sizeof(*ks));
5677	ks->mask = kreq->param;
5678	ks->buf = buf;
5679	ks->bufsize = bufsize;
5680
5681	if (ks->mask & STATMOUNT_STRING_REQ) {
5682	if (bufsize == sizeof(ks->sm))
5683	return -EOVERFLOW;
5684
5685	ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
5686	if (!ks->seq.buf)
5687	return -ENOMEM;
5688
5689	ks->seq.size = seq_size;
5690	}
5691
5692	return `0`;
5693	}
5694
5695	static int copy_mnt_id_req(const struct mnt_id_req __user *req,
5696	struct mnt_id_req *kreq)
5697	{
5698	int ret;
5699	size_t usize;
5700
5701	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
5702
5703	ret = get_user(usize, &req->size);
5704	if (ret)
5705	return -EFAULT;
5706	if (unlikely(usize > PAGE_SIZE))
5707	return -E2BIG;
5708	if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
5709	return -EINVAL;
5710	memset(kreq, `0`, sizeof(*kreq));
5711	ret = copy_struct_from_user(dst: kreq, ksize: sizeof(*kreq), src: req, usize);
5712	if (ret)
5713	return ret;
5714	if (kreq->mnt_ns_fd != `0` && kreq->mnt_ns_id)
5715	return -EINVAL;
5716	/ The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. /
5717	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
5718	return -EINVAL;
5719	return `0`;
5720	}
5721
5722	/*
5723	* If the user requested a specific mount namespace id, look that up and return
5724	* that, or if not simply grab a passive reference on our mount namespace and
5725	* return that.
5726	*/
5727	static struct mnt_namespace grab_requested_mnt_ns(const* struct mnt_id_req *kreq)
5728	{
5729	struct mnt_namespace *mnt_ns;
5730
5731	if (kreq->mnt_ns_id) {
5732	mnt_ns = lookup_mnt_ns(mnt_ns_id: kreq->mnt_ns_id);
5733	if (!mnt_ns)
5734	return ERR_PTR(error: -ENOENT);
5735	} else if (kreq->mnt_ns_fd) {
5736	struct ns_common *ns;
5737
5738	CLASS(fd, f)(fd: kreq->mnt_ns_fd);
5739	if (fd_empty(f))
5740	return ERR_PTR(error: -EBADF);
5741
5742	if (!proc_ns_file(fd_file(f)))
5743	return ERR_PTR(error: -EINVAL);
5744
5745	ns = get_proc_ns(file_inode(fd_file(f)));
5746	if (ns->ns_type != CLONE_NEWNS)
5747	return ERR_PTR(error: -EINVAL);
5748
5749	mnt_ns = to_mnt_ns(ns);
5750	refcount_inc(r: &mnt_ns->passive);
5751	} else {
5752	mnt_ns = current->nsproxy->mnt_ns;
5753	refcount_inc(r: &mnt_ns->passive);
5754	}
5755
5756	return mnt_ns;
5757	}
5758
5759	SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
5760	struct statmount __user *, buf, size_t, bufsize,
5761	unsigned int, flags)
5762	{
5763	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
5764	struct kstatmount *ks __free(kfree) = NULL;
5765	struct mnt_id_req kreq;
5766	/ We currently support retrieval of 3 strings. /
5767	size_t seq_size = `3` * PATH_MAX;
5768	int ret;
5769
5770	if (flags)
5771	return -EINVAL;
5772
5773	ret = copy_mnt_id_req(req, kreq: &kreq);
5774	if (ret)
5775	return ret;
5776
5777	ns = grab_requested_mnt_ns(kreq: &kreq);
5778	if (IS_ERR(ptr: ns))
5779	return PTR_ERR(ptr: ns);
5780
5781	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
5782	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
5783	return -ENOENT;
5784
5785	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
5786	if (!ks)
5787	return -ENOMEM;
5788
5789	retry:
5790	ret = prepare_kstatmount(ks, kreq: &kreq, buf, bufsize, seq_size);
5791	if (ret)
5792	return ret;
5793
5794	scoped_guard(namespace_shared)
5795	ret = do_statmount(s: ks, mnt_id: kreq.mnt_id, mnt_ns_id: kreq.mnt_ns_id, ns);
5796
5797	if (!ret)
5798	ret = copy_statmount_to_user(s: ks);
5799	kvfree(addr: ks->seq.buf);
5800	path_put(&ks->root);
5801	if (retry_statmount(ret, seq_size: &seq_size))
5802	goto retry;
5803	return ret;
5804	}
5805
5806	struct klistmount {
5807	u64 last_mnt_id;
5808	u64 mnt_parent_id;
5809	u64 *kmnt_ids;
5810	u32 nr_mnt_ids;
5811	struct mnt_namespace *ns;
5812	struct path root;
5813	};
5814
5815	/ locks: namespace_shared /
5816	static ssize_t do_listmount(struct klistmount *kls, bool reverse)
5817	{
5818	struct mnt_namespace *ns = kls->ns;
5819	u64 mnt_parent_id = kls->mnt_parent_id;
5820	u64 last_mnt_id = kls->last_mnt_id;
5821	u64 *mnt_ids = kls->kmnt_ids;
5822	size_t nr_mnt_ids = kls->nr_mnt_ids;
5823	struct path orig;
5824	struct mount r, first;
5825	ssize_t ret;
5826
5827	rwsem_assert_held(sem: &namespace_sem);
5828
5829	ret = grab_requested_root(ns, root: &kls->root);
5830	if (ret)
5831	return ret;
5832
5833	if (mnt_parent_id == LSMT_ROOT) {
5834	orig = kls->root;
5835	} else {
5836	orig.mnt = lookup_mnt_in_ns(id: mnt_parent_id, ns);
5837	if (!orig.mnt)
5838	return -ENOENT;
5839	orig.dentry = orig.mnt->mnt_root;
5840	}
5841
5842	/*
5843	* Don't trigger audit denials. We just want to determine what
5844	* mounts to show users.
5845	*/
5846	if (!is_path_reachable(mnt: real_mount(mnt: orig.mnt), dentry: orig.dentry, root: &kls->root) &&
5847	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
5848	return -EPERM;
5849
5850	ret = security_sb_statfs(dentry: orig.dentry);
5851	if (ret)
5852	return ret;
5853
5854	if (!last_mnt_id) {
5855	if (reverse)
5856	first = node_to_mount(node: ns->mnt_last_node);
5857	else
5858	first = node_to_mount(node: ns->mnt_first_node);
5859	} else {
5860	if (reverse)
5861	first = mnt_find_id_at_reverse(ns, mnt_id: last_mnt_id - `1`);
5862	else
5863	first = mnt_find_id_at(ns, mnt_id: last_mnt_id + `1`);
5864	}
5865
5866	for (ret = `0`, r = first; r && nr_mnt_ids; r = listmnt_next(curr: r, reverse)) {
5867	if (r->mnt_id_unique == mnt_parent_id)
5868	continue;
5869	if (!is_path_reachable(mnt: r, dentry: r->mnt.mnt_root, root: &orig))
5870	continue;
5871	*mnt_ids = r->mnt_id_unique;
5872	mnt_ids++;
5873	nr_mnt_ids--;
5874	ret++;
5875	}
5876	return ret;
5877	}
5878
5879	static void __free_klistmount_free(const struct klistmount *kls)
5880	{
5881	path_put(&kls->root);
5882	kvfree(addr: kls->kmnt_ids);
5883	mnt_ns_release(ns: kls->ns);
5884	}
5885
5886	static inline int prepare_klistmount(struct klistmount kls, struct* mnt_id_req *kreq,
5887	size_t nr_mnt_ids)
5888	{
5889	u64 last_mnt_id = kreq->param;
5890	struct mnt_namespace *ns;
5891
5892	/ The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. /
5893	if (last_mnt_id != `0` && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
5894	return -EINVAL;
5895
5896	kls->last_mnt_id = last_mnt_id;
5897
5898	kls->nr_mnt_ids = nr_mnt_ids;
5899	kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids),
5900	GFP_KERNEL_ACCOUNT);
5901	if (!kls->kmnt_ids)
5902	return -ENOMEM;
5903
5904	ns = grab_requested_mnt_ns(kreq);
5905	if (IS_ERR(ptr: ns))
5906	return PTR_ERR(ptr: ns);
5907	kls->ns = ns;
5908
5909	kls->mnt_parent_id = kreq->mnt_id;
5910	return `0`;
5911	}
5912
5913	SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
5914	u64 __user , mnt_ids, size_t, nr_mnt_ids, unsigned* int, flags)
5915	{
5916	struct klistmount kls __free(klistmount_free) = {};
5917	const size_t maxcount = `1000000`;
5918	struct mnt_id_req kreq;
5919	ssize_t ret;
5920
5921	if (flags & ~LISTMOUNT_REVERSE)
5922	return -EINVAL;
5923
5924	/*
5925	* If the mount namespace really has more than 1 million mounts the
5926	* caller must iterate over the mount namespace (and reconsider their
5927	* system design...).
5928	*/
5929	if (unlikely(nr_mnt_ids > maxcount))
5930	return -EOVERFLOW;
5931
5932	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
5933	return -EFAULT;
5934
5935	ret = copy_mnt_id_req(req, kreq: &kreq);
5936	if (ret)
5937	return ret;
5938
5939	ret = prepare_klistmount(kls: &kls, kreq: &kreq, nr_mnt_ids);
5940	if (ret)
5941	return ret;
5942
5943	if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) &&
5944	!ns_capable_noaudit(ns: kls.ns->user_ns, CAP_SYS_ADMIN))
5945	return -ENOENT;
5946
5947	/*
5948	* We only need to guard against mount topology changes as
5949	* listmount() doesn't care about any mount properties.
5950	*/
5951	scoped_guard(namespace_shared)
5952	ret = do_listmount(kls: &kls, reverse: (flags & LISTMOUNT_REVERSE));
5953	if (ret <= `0`)
5954	return ret;
5955
5956	if (copy_to_user(to: mnt_ids, from: kls.kmnt_ids, n: ret * sizeof(*mnt_ids)))
5957	return -EFAULT;
5958
5959	return ret;
5960	}
5961
5962	struct mnt_namespace init_mnt_ns = {
5963	.ns = NS_COMMON_INIT(init_mnt_ns),
5964	.user_ns = &init_user_ns,
5965	.passive = REFCOUNT_INIT(`1`),
5966	.mounts = RB_ROOT,
5967	.poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
5968	};
5969
5970	static void __init init_mount_tree(void)
5971	{
5972	struct vfsmount *mnt;
5973	struct mount *m;
5974	struct path root;
5975
5976	mnt = vfs_kern_mount(&rootfs_fs_type, `0`, "rootfs", initramfs_options);
5977	if (IS_ERR(ptr: mnt))
5978	panic(fmt: "Can't create rootfs");
5979
5980	m = real_mount(mnt);
5981	init_mnt_ns.root = m;
5982	init_mnt_ns.nr_mounts = `1`;
5983	mnt_add_to_ns(ns: &init_mnt_ns, mnt: m);
5984	init_task.nsproxy->mnt_ns = &init_mnt_ns;
5985	get_mnt_ns(ns: &init_mnt_ns);
5986
5987	root.mnt = mnt;
5988	root.dentry = mnt->mnt_root;
5989
5990	set_fs_pwd(current->fs, &root);
5991	set_fs_root(current->fs, &root);
5992
5993	ns_tree_add(&init_mnt_ns);
5994	}
5995
5996	void __init mnt_init(void)
5997	{
5998	int err;
5999
6000	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
6001	`0`, SLAB_HWCACHE_ALIGN\|SLAB_PANIC\|SLAB_ACCOUNT, NULL);
6002
6003	mount_hashtable = alloc_large_system_hash(tablename: "Mount-cache",
6004	bucketsize: sizeof(struct hlist_head),
6005	numentries: mhash_entries, scale: `19`,
6006	HASH_ZERO,
6007	hash_shift: &m_hash_shift, hash_mask: &m_hash_mask, low_limit: `0`, high_limit: `0`);
6008	mountpoint_hashtable = alloc_large_system_hash(tablename: "Mountpoint-cache",
6009	bucketsize: sizeof(struct hlist_head),
6010	numentries: mphash_entries, scale: `19`,
6011	HASH_ZERO,
6012	hash_shift: &mp_hash_shift, hash_mask: &mp_hash_mask, low_limit: `0`, high_limit: `0`);
6013
6014	if (!mount_hashtable \|\| !mountpoint_hashtable)
6015	panic(fmt: "Failed to allocate mount hash table\n");
6016
6017	kernfs_init();
6018
6019	err = sysfs_init();
6020	if (err)
6021	printk(KERN_WARNING "%s: sysfs_init error: %d\n",
6022	__func__, err);
6023	fs_kobj = kobject_create_and_add(name: "fs", NULL);
6024	if (!fs_kobj)
6025	printk(KERN_WARNING "%s: kobj create error\n", __func__);
6026	shmem_init();
6027	init_rootfs();
6028	init_mount_tree();
6029	}
6030
6031	void put_mnt_ns(struct mnt_namespace *ns)
6032	{
6033	if (!ns_ref_put(ns))
6034	return;
6035	guard(namespace_excl)();
6036	emptied_ns = ns;
6037	guard(mount_writer)();
6038	umount_tree(mnt: ns->root, how: `0`);
6039	}
6040
6041	struct vfsmount kern_mount(struct* file_system_type *type)
6042	{
6043	struct vfsmount *mnt;
6044	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
6045	if (!IS_ERR(ptr: mnt)) {
6046	/*
6047	* it is a longterm mount, don't release mnt until
6048	* we unmount before file sys is unregistered
6049	*/
6050	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
6051	}
6052	return mnt;
6053	}
6054	EXPORT_SYMBOL_GPL(kern_mount);
6055
/*
 * Release a mount obtained from kern_mount().  An ERR_PTR() value is
 * accepted and ignored.
 */
void kern_unmount(struct vfsmount *mnt)
{
	if (IS_ERR(mnt))
		return;

	/* release long term mount so mount point can be released */
	mnt_make_shortterm(mnt);
	synchronize_rcu();	/* yecchhh... */
	mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);
6066
/*
 * Release @num kernel-internal mounts at once: all of them are made
 * short-term first, so that a single expedited RCU grace period covers the
 * whole array before the references are dropped.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
	struct vfsmount **const end = mnt + num;
	struct vfsmount **p;

	for (p = mnt; p < end; p++)
		mnt_make_shortterm(*p);

	synchronize_rcu_expedited();

	for (p = mnt; p < end; p++)
		mntput(*p);
}
EXPORT_SYMBOL(kern_unmount_array);
6078
6079	bool our_mnt(struct vfsmount *mnt)
6080	{
6081	return check_mnt(mnt: real_mount(mnt));
6082	}
6083
6084	bool current_chrooted(void)
6085	{
6086	/ Does the current process have a non-standard root /
6087	struct path fs_root __free(path_put) = {};
6088	struct mount *root;
6089
6090	get_fs_root(current->fs, root: &fs_root);
6091
6092	/ Find the namespace root /
6093
6094	guard(mount_locked_reader)();
6095
6096	root = topmost_overmount(current->nsproxy->mnt_ns->root);
6097
6098	return fs_root.mnt != &root->mnt \|\| !path_mounted(path: &fs_root);
6099	}
6100
6101	static bool mnt_already_visible(struct mnt_namespace *ns,
6102	const struct super_block *sb,
6103	int *new_mnt_flags)
6104	{
6105	int new_flags = *new_mnt_flags;
6106	struct mount mnt, n;
6107
6108	guard(namespace_shared)();
6109	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
6110	struct mount *child;
6111	int mnt_flags;
6112
6113	if (mnt->mnt.mnt_sb->s_type != sb->s_type)
6114	continue;
6115
6116	/ This mount is not fully visible if it's root directory*
6117	* is not the root directory of the filesystem.
6118	*/
6119	if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
6120	continue;
6121
6122	/ A local view of the mount flags /
6123	mnt_flags = mnt->mnt.mnt_flags;
6124
6125	/ Don't miss readonly hidden in the superblock flags /
6126	if (sb_rdonly(sb: mnt->mnt.mnt_sb))
6127	mnt_flags \|= MNT_LOCK_READONLY;
6128
6129	/ Verify the mount flags are equal to or more permissive*
6130	* than the proposed new mount.
6131	*/
6132	if ((mnt_flags & MNT_LOCK_READONLY) &&
6133	!(new_flags & MNT_READONLY))
6134	continue;
6135	if ((mnt_flags & MNT_LOCK_ATIME) &&
6136	((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
6137	continue;
6138
6139	/ This mount is not fully visible if there are any*
6140	* locked child mounts that cover anything except for
6141	* empty directories.
6142	*/
6143	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
6144	struct inode *inode = child->mnt_mountpoint->d_inode;
6145	/ Only worry about locked mounts /
6146	if (!(child->mnt.mnt_flags & MNT_LOCKED))
6147	continue;
6148	/ Is the directory permanently empty? /
6149	if (!is_empty_dir_inode(inode))
6150	goto next;
6151	}
6152	/ Preserve the locked attributes /
6153	*new_mnt_flags \|= mnt_flags & (MNT_LOCK_READONLY \| \
6154	MNT_LOCK_ATIME);
6155	return true;
6156	next: ;
6157	}
6158	return false;
6159	}
6160
6161	static bool mount_too_revealing(const struct super_block sb, int* *new_mnt_flags)
6162	{
6163	const unsigned long required_iflags = SB_I_NOEXEC \| SB_I_NODEV;
6164	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
6165	unsigned long s_iflags;
6166
6167	if (ns->user_ns == &init_user_ns)
6168	return false;
6169
6170	/ Can this filesystem be too revealing? /
6171	s_iflags = sb->s_iflags;
6172	if (!(s_iflags & SB_I_USERNS_VISIBLE))
6173	return false;
6174
6175	if ((s_iflags & required_iflags) != required_iflags) {
6176	WARN_ONCE(`1`, "Expected s_iflags to contain 0x%lx\n",
6177	required_iflags);
6178	return true;
6179	}
6180
6181	return !mnt_already_visible(ns, sb, new_mnt_flags);
6182	}
6183
6184	bool mnt_may_suid(struct vfsmount *mnt)
6185	{
6186	/*
6187	* Foreign mounts (accessed via fchdir or through /proc
6188	* symlinks) are always treated as if they are nosuid. This
6189	* prevents namespaces from trusting potentially unsafe
6190	* suid/sgid bits, file caps, or security labels that originate
6191	* in other namespaces.
6192	*/
6193	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(mnt: real_mount(mnt)) &&
6194	current_in_userns(target_ns: mnt->mnt_sb->s_user_ns);
6195	}
6196
6197	static struct ns_common mntns_get(struct* task_struct *task)
6198	{
6199	struct ns_common *ns = NULL;
6200	struct nsproxy *nsproxy;
6201
6202	task_lock(p: task);
6203	nsproxy = task->nsproxy;
6204	if (nsproxy) {
6205	ns = &nsproxy->mnt_ns->ns;
6206	get_mnt_ns(ns: to_mnt_ns(ns));
6207	}
6208	task_unlock(p: task);
6209
6210	return ns;
6211	}
6212
/* proc_ns_operations ->put: drop a mount namespace reference. */
static void mntns_put(struct ns_common *ns)
{
	struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

	put_mnt_ns(mnt_ns);
}
6217
6218	static int mntns_install(struct nsset nsset, struct* ns_common *ns)
6219	{
6220	struct nsproxy *nsproxy = nsset->nsproxy;
6221	struct fs_struct *fs = nsset->fs;
6222	struct mnt_namespace mnt_ns = to_mnt_ns(ns), old_mnt_ns;
6223	struct user_namespace *user_ns = nsset->cred->user_ns;
6224	struct path root;
6225	int err;
6226
6227	if (!ns_capable(ns: mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
6228	!ns_capable(ns: user_ns, CAP_SYS_CHROOT) \|\|
6229	!ns_capable(ns: user_ns, CAP_SYS_ADMIN))
6230	return -EPERM;
6231
6232	if (is_anon_ns(ns: mnt_ns))
6233	return -EINVAL;
6234
6235	if (fs->users != `1`)
6236	return -EINVAL;
6237
6238	get_mnt_ns(ns: mnt_ns);
6239	old_mnt_ns = nsproxy->mnt_ns;
6240	nsproxy->mnt_ns = mnt_ns;
6241
6242	/ Find the root /
6243	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
6244	"/", LOOKUP_DOWN, &root);
6245	if (err) {
6246	/ revert to old namespace /
6247	nsproxy->mnt_ns = old_mnt_ns;
6248	put_mnt_ns(ns: mnt_ns);
6249	return err;
6250	}
6251
6252	put_mnt_ns(ns: old_mnt_ns);
6253
6254	/ Update the pwd and root /
6255	set_fs_pwd(fs, &root);
6256	set_fs_root(fs, &root);
6257
6258	path_put(&root);
6259	return `0`;
6260	}
6261
6262	static struct user_namespace mntns_owner(struct* ns_common *ns)
6263	{
6264	return to_mnt_ns(ns)->user_ns;
6265	}
6266
6267	const struct proc_ns_operations mntns_operations = {
6268	.name = "mnt",
6269	.get = mntns_get,
6270	.put = mntns_put,
6271	.install = mntns_install,
6272	.owner = mntns_owner,
6273	};
6274
#ifdef CONFIG_SYSCTL
/*
 * The "mount-max" sysctl, registered under "fs": exposes sysctl_mount_max,
 * the maximum number of mounts in a mount namespace, with a lower bound
 * of 1.
 *
 * NOTE(review): sysctl_mount_max is an unsigned int but is handled by the
 * signed proc_dointvec_minmax; confirm this is intended.
 */
static const struct ctl_table fs_namespace_sysctls[] = {
	{
		.procname	= "mount-max",
		.data		= &sysctl_mount_max,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
};

/* Register the table above during fs_initcall. */
static int __init init_fs_namespace_sysctls(void)
{
	register_sysctl_init("fs", fs_namespace_sysctls);
	return 0;
}
fs_initcall(init_fs_namespace_sysctls);

#endif /* CONFIG_SYSCTL */
6295

source code of linux/fs/namespace.c