userfaultfd.c source code [linux/fs/userfaultfd.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* fs/userfaultfd.c
4	*
5	* Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6	* Copyright (C) 2008-2009 Red Hat, Inc.
7	* Copyright (C) 2015 Red Hat, Inc.
8	*
9	* Some part derived from fs/eventfd.c (anon inode setup) and
10	* mm/ksm.c (mm hashing).
11	*/
12
13	#include <linux/list.h>
14	#include <linux/hashtable.h>
15	#include <linux/sched/signal.h>
16	#include <linux/sched/mm.h>
17	#include <linux/mm.h>
18	#include <linux/mm_inline.h>
19	#include <linux/mmu_notifier.h>
20	#include <linux/poll.h>
21	#include <linux/slab.h>
22	#include <linux/seq_file.h>
23	#include <linux/file.h>
24	#include <linux/bug.h>
25	#include <linux/anon_inodes.h>
26	#include <linux/syscalls.h>
27	#include <linux/userfaultfd_k.h>
28	#include <linux/mempolicy.h>
29	#include <linux/ioctl.h>
30	#include <linux/security.h>
31	#include <linux/hugetlb.h>
32	#include <linux/leafops.h>
33	#include <linux/miscdevice.h>
34	#include <linux/uio.h>
35
36	static int sysctl_unprivileged_userfaultfd __read_mostly;
37
38	#ifdef CONFIG_SYSCTL
39	static const struct ctl_table vm_userfaultfd_table[] = {
40	{
41	.procname = "unprivileged_userfaultfd",
42	.data = &sysctl_unprivileged_userfaultfd,
43	.maxlen = sizeof(sysctl_unprivileged_userfaultfd),
44	.mode = `0644`,
45	.proc_handler = proc_dointvec_minmax,
46	.extra1 = SYSCTL_ZERO,
47	.extra2 = SYSCTL_ONE,
48	},
49	};
50	#endif
51
52	static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
53
54	struct userfaultfd_fork_ctx {
55	struct userfaultfd_ctx *orig;
56	struct userfaultfd_ctx *new;
57	struct list_head list;
58	};
59
60	struct userfaultfd_unmap_ctx {
61	struct userfaultfd_ctx *ctx;
62	unsigned long start;
63	unsigned long end;
64	struct list_head list;
65	};
66
67	struct userfaultfd_wait_queue {
68	struct uffd_msg msg;
69	wait_queue_entry_t wq;
70	struct userfaultfd_ctx *ctx;
71	bool waken;
72	};
73
/*
 * Wake key passed to userfaultfd_wake_function(): waiters whose faulting
 * address lies in [start, start + len) are woken.
 */
struct userfaultfd_wake_range {
	/* first address of the range to wake */
	unsigned long start;
	/* length of the range; 0 means wake every waiter */
	unsigned long len;
};
78
/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED		(1u << 31)
81
82	static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
83	{
84	return ctx->features & UFFD_FEATURE_INITIALIZED;
85	}
86
87	static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
88	{
89	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
90	}
91
92	/*
93	* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
94	* meaningful when userfaultfd_wp()==true on the vma and when it's
95	* anonymous.
96	*/
97	bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
98	{
99	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
100
101	if (!ctx)
102	return false;
103
104	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
105	}
106
107	static int userfaultfd_wake_function(wait_queue_entry_t wq, unsigned* mode,
108	int wake_flags, void *key)
109	{
110	struct userfaultfd_wake_range *range = key;
111	int ret;
112	struct userfaultfd_wait_queue *uwq;
113	unsigned long start, len;
114
115	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
116	ret = `0`;
117	/ len == 0 means wake all /
118	start = range->start;
119	len = range->len;
120	if (len && (start > uwq->msg.arg.pagefault.address \|\|
121	start + len <= uwq->msg.arg.pagefault.address))
122	goto out;
123	WRITE_ONCE(uwq->waken, true);
124	/*
125	* The Program-Order guarantees provided by the scheduler
126	* ensure uwq->waken is visible before the task is woken.
127	*/
128	ret = wake_up_state(tsk: wq->private, state: mode);
129	if (ret) {
130	/*
131	* Wake only once, autoremove behavior.
132	*
133	* After the effect of list_del_init is visible to the other
134	* CPUs, the waitqueue may disappear from under us, see the
135	* !list_empty_careful() in handle_userfault().
136	*
137	* try_to_wake_up() has an implicit smp_mb(), and the
138	* wq->private is read before calling the extern function
139	* "wake_up_state" (which in turns calls try_to_wake_up).
140	*/
141	list_del_init(entry: &wq->entry);
142	}
143	out:
144	return ret;
145	}
146
147	/**
148	* userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
149	* context.
150	* @ctx: [in] Pointer to the userfaultfd context.
151	*/
152	static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
153	{
154	refcount_inc(r: &ctx->refcount);
155	}
156
157	/**
158	* userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
159	* context.
160	* @ctx: [in] Pointer to userfaultfd context.
161	*
162	* The userfaultfd context reference must have been previously acquired either
163	* with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
164	*/
165	static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
166	{
167	if (refcount_dec_and_test(r: &ctx->refcount)) {
168	VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock));
169	VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh));
170	VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock));
171	VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh));
172	VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock));
173	VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh));
174	VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock));
175	VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh));
176	mmdrop(mm: ctx->mm);
177	kmem_cache_free(s: userfaultfd_ctx_cachep, objp: ctx);
178	}
179	}
180
181	static inline void msg_init(struct uffd_msg *msg)
182	{
183	BUILD_BUG_ON(sizeof(struct uffd_msg) != `32`);
184	/*
185	* Must use memset to zero out the paddings or kernel data is
186	* leaked to userland.
187	*/
188	memset(msg, `0`, sizeof(struct uffd_msg));
189	}
190
191	static inline struct uffd_msg userfault_msg(unsigned long address,
192	unsigned long real_address,
193	unsigned int flags,
194	unsigned long reason,
195	unsigned int features)
196	{
197	struct uffd_msg msg;
198
199	msg_init(msg: &msg);
200	msg.event = UFFD_EVENT_PAGEFAULT;
201
202	msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
203	real_address : address;
204
205	/*
206	* These flags indicate why the userfault occurred:
207	* - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
208	* - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
209	* - Neither of these flags being set indicates a MISSING fault.
210	*
211	* Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
212	* fault. Otherwise, it was a read fault.
213	*/
214	if (flags & FAULT_FLAG_WRITE)
215	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_WRITE;
216	if (reason & VM_UFFD_WP)
217	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_WP;
218	if (reason & VM_UFFD_MINOR)
219	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_MINOR;
220	if (features & UFFD_FEATURE_THREAD_ID)
221	msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
222	return msg;
223	}
224
225	#ifdef CONFIG_HUGETLB_PAGE
226	/*
227	* Same functionality as userfaultfd_must_wait below with modifications for
228	* hugepmd ranges.
229	*/
230	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
231	struct vm_fault *vmf,
232	unsigned long reason)
233	{
234	struct vm_area_struct *vma = vmf->vma;
235	pte_t *ptep, pte;
236
237	assert_fault_locked(vmf);
238
239	ptep = hugetlb_walk(vma, addr: vmf->address, sz: vma_mmu_pagesize(vma));
240	if (!ptep)
241	return true;
242
243	pte = huge_ptep_get(mm: vma->vm_mm, addr: vmf->address, ptep);
244
245	/*
246	* Lockless access: we're in a wait_event so it's ok if it
247	* changes under us.
248	*/
249
250	/ Entry is still missing, wait for userspace to resolve the fault. /
251	if (huge_pte_none(pte))
252	return true;
253	/ UFFD PTE markers require userspace to resolve the fault. /
254	if (pte_is_uffd_marker(pte))
255	return true;
256	/*
257	* If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
258	* resolve the fault.
259	*/
260	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
261	return true;
262
263	return false;
264	}
265	#else
266	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
267	struct vm_fault *vmf,
268	unsigned long reason)
269	{
270	/ Should never get here. /
271	VM_WARN_ON_ONCE(`1`);
272	return false;
273	}
274	#endif /* CONFIG_HUGETLB_PAGE */
275
276	/*
277	* Verify the pagetables are still not ok after having registered into
278	* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
279	* userfault that has already been resolved, if userfaultfd_read_iter and
280	* UFFDIO_COPY\|ZEROPAGE are being run simultaneously on two different
281	* threads.
282	*/
283	static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
284	struct vm_fault *vmf,
285	unsigned long reason)
286	{
287	struct mm_struct *mm = ctx->mm;
288	unsigned long address = vmf->address;
289	pgd_t *pgd;
290	p4d_t *p4d;
291	pud_t *pud;
292	pmd_t *pmd, _pmd;
293	pte_t *pte;
294	pte_t ptent;
295	bool ret;
296
297	assert_fault_locked(vmf);
298
299	pgd = pgd_offset(mm, address);
300	if (!pgd_present(pgd: *pgd))
301	return true;
302	p4d = p4d_offset(pgd, address);
303	if (!p4d_present(p4d: *p4d))
304	return true;
305	pud = pud_offset(p4d, address);
306	if (!pud_present(pud: *pud))
307	return true;
308	pmd = pmd_offset(pud, address);
309	again:
310	_pmd = pmdp_get_lockless(pmdp: pmd);
311	if (pmd_none(pmd: _pmd))
312	return true;
313
314	/*
315	* A race could arise which would result in a softleaf entry such as
316	* migration entry unexpectedly being present in the PMD, so explicitly
317	* check for this and bail out if so.
318	*/
319	if (!pmd_present(pmd: _pmd))
320	return false;
321
322	if (pmd_trans_huge(pmd: _pmd))
323	return !pmd_write(pmd: _pmd) && (reason & VM_UFFD_WP);
324
325	pte = pte_offset_map(pmd, addr: address);
326	if (!pte)
327	goto again;
328
329	/*
330	* Lockless access: we're in a wait_event so it's ok if it
331	* changes under us.
332	*/
333	ptent = ptep_get(ptep: pte);
334
335	ret = true;
336	/ Entry is still missing, wait for userspace to resolve the fault. /
337	if (pte_none(pte: ptent))
338	goto out;
339	/ UFFD PTE markers require userspace to resolve the fault. /
340	if (pte_is_uffd_marker(pte: ptent))
341	goto out;
342	/*
343	* If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
344	* resolve the fault.
345	*/
346	if (!pte_write(pte: ptent) && (reason & VM_UFFD_WP))
347	goto out;
348
349	ret = false;
350	out:
351	pte_unmap(pte);
352	return ret;
353	}
354
355	static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
356	{
357	if (flags & FAULT_FLAG_INTERRUPTIBLE)
358	return TASK_INTERRUPTIBLE;
359
360	if (flags & FAULT_FLAG_KILLABLE)
361	return TASK_KILLABLE;
362
363	return TASK_UNINTERRUPTIBLE;
364	}
365
366	/*
367	* The locking rules involved in returning VM_FAULT_RETRY depending on
368	* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
369	* FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
370	* recommendation in __lock_page_or_retry is not an understatement.
371	*
372	* If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
373	* before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
374	* not set.
375	*
376	* If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
377	* set, VM_FAULT_RETRY can still be returned if and only if there are
378	* fatal_signal_pending()s, and the mmap_lock must be released before
379	* returning it.
380	*/
381	vm_fault_t handle_userfault(struct vm_fault vmf, unsigned* long reason)
382	{
383	struct vm_area_struct *vma = vmf->vma;
384	struct mm_struct *mm = vma->vm_mm;
385	struct userfaultfd_ctx *ctx;
386	struct userfaultfd_wait_queue uwq;
387	vm_fault_t ret = VM_FAULT_SIGBUS;
388	bool must_wait;
389	unsigned int blocking_state;
390
391	/*
392	* We don't do userfault handling for the final child pid update
393	* and when coredumping (faults triggered by get_dump_page()).
394	*/
395	if (current->flags & (PF_EXITING\|PF_DUMPCORE))
396	goto out;
397
398	assert_fault_locked(vmf);
399
400	ctx = vma->vm_userfaultfd_ctx.ctx;
401	if (!ctx)
402	goto out;
403
404	VM_WARN_ON_ONCE(ctx->mm != mm);
405
406	/ Any unrecognized flag is a bug. /
407	VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS);
408	/ 0 or > 1 flags set is a bug; we expect exactly 1. /
409	VM_WARN_ON_ONCE(!reason \|\| (reason & (reason - `1`)));
410
411	if (ctx->features & UFFD_FEATURE_SIGBUS)
412	goto out;
413	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
414	goto out;
415
416	/*
417	* Check that we can return VM_FAULT_RETRY.
418	*
419	* NOTE: it should become possible to return VM_FAULT_RETRY
420	* even if FAULT_FLAG_TRIED is set without leading to gup()
421	* -EBUSY failures, if the userfaultfd is to be extended for
422	* VM_UFFD_WP tracking and we intend to arm the userfault
423	* without first stopping userland access to the memory. For
424	* VM_UFFD_MISSING userfaults this is enough for now.
425	*/
426	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
427	/*
428	* Validate the invariant that nowait must allow retry
429	* to be sure not to return SIGBUS erroneously on
430	* nowait invocations.
431	*/
432	VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
433	#ifdef CONFIG_DEBUG_VM
434	if (printk_ratelimit()) {
435	pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n",
436	vmf->flags);
437	dump_stack();
438	}
439	#endif
440	goto out;
441	}
442
443	/*
444	* Handle nowait, not much to do other than tell it to retry
445	* and wait.
446	*/
447	ret = VM_FAULT_RETRY;
448	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
449	goto out;
450
451	if (unlikely(READ_ONCE(ctx->released))) {
452	/*
453	* If a concurrent release is detected, do not return
454	* VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
455	* return VM_FAULT_RETRY with lock released proactively.
456	*
457	* If we were to return VM_FAULT_SIGBUS here, the non
458	* cooperative manager would be instead forced to
459	* always call UFFDIO_UNREGISTER before it can safely
460	* close the uffd, to avoid involuntary SIGBUS triggered.
461	*
462	* If we were to return VM_FAULT_NOPAGE, it would work for
463	* the fault path, in which the lock will be released
464	* later. However for GUP, faultin_page() does nothing
465	* special on NOPAGE, so GUP would spin retrying without
466	* releasing the mmap read lock, causing possible livelock.
467	*
468	* Here only VM_FAULT_RETRY would make sure the mmap lock
469	* be released immediately, so that the thread concurrently
470	* releasing the userfault would always make progress.
471	*/
472	release_fault_lock(vmf);
473	goto out;
474	}
475
476	/ take the reference before dropping the mmap_lock /
477	userfaultfd_ctx_get(ctx);
478
479	init_waitqueue_func_entry(wq_entry: &uwq.wq, func: userfaultfd_wake_function);
480	uwq.wq.private = current;
481	uwq.msg = userfault_msg(address: vmf->address, real_address: vmf->real_address, flags: vmf->flags,
482	reason, features: ctx->features);
483	uwq.ctx = ctx;
484	uwq.waken = false;
485
486	blocking_state = userfaultfd_get_blocking_state(flags: vmf->flags);
487
488	/*
489	* Take the vma lock now, in order to safely call
490	* userfaultfd_huge_must_wait() later. Since acquiring the
491	* (sleepable) vma lock can modify the current task state, that
492	* must be before explicitly calling set_current_state().
493	*/
494	if (is_vm_hugetlb_page(vma))
495	hugetlb_vma_lock_read(vma);
496
497	spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
498	/*
499	* After the __add_wait_queue the uwq is visible to userland
500	* through poll/read().
501	*/
502	__add_wait_queue(wq_head: &ctx->fault_pending_wqh, wq_entry: &uwq.wq);
503	/*
504	* The smp_mb() after __set_current_state prevents the reads
505	* following the spin_unlock to happen before the list_add in
506	* __add_wait_queue.
507	*/
508	set_current_state(blocking_state);
509	spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
510
511	if (is_vm_hugetlb_page(vma)) {
512	must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
513	hugetlb_vma_unlock_read(vma);
514	} else {
515	must_wait = userfaultfd_must_wait(ctx, vmf, reason);
516	}
517
518	release_fault_lock(vmf);
519
520	if (likely(must_wait && !READ_ONCE(ctx->released))) {
521	wake_up_poll(&ctx->fd_wqh, EPOLLIN);
522	schedule();
523	}
524
525	__set_current_state(TASK_RUNNING);
526
527	/*
528	* Here we race with the list_del; list_add in
529	* userfaultfd_ctx_read(), however because we don't ever run
530	* list_del_init() to refile across the two lists, the prev
531	* and next pointers will never point to self. list_add also
532	* would never let any of the two pointers to point to
533	* self. So list_empty_careful won't risk to see both pointers
534	* pointing to self at any time during the list refile. The
535	* only case where list_del_init() is called is the full
536	* removal in the wake function and there we don't re-list_add
537	* and it's fine not to block on the spinlock. The uwq on this
538	* kernel stack can be released after the list_del_init.
539	*/
540	if (!list_empty_careful(head: &uwq.wq.entry)) {
541	spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
542	/*
543	* No need of list_del_init(), the uwq on the stack
544	* will be freed shortly anyway.
545	*/
546	list_del(entry: &uwq.wq.entry);
547	spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
548	}
549
550	/*
551	* ctx may go away after this if the userfault pseudo fd is
552	* already released.
553	*/
554	userfaultfd_ctx_put(ctx);
555
556	out:
557	return ret;
558	}
559
560	static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
561	struct userfaultfd_wait_queue *ewq)
562	{
563	struct userfaultfd_ctx *release_new_ctx;
564
565	if (WARN_ON_ONCE(current->flags & PF_EXITING))
566	goto out;
567
568	ewq->ctx = ctx;
569	init_waitqueue_entry(wq_entry: &ewq->wq, current);
570	release_new_ctx = NULL;
571
572	spin_lock_irq(lock: &ctx->event_wqh.lock);
573	/*
574	* After the __add_wait_queue the uwq is visible to userland
575	* through poll/read().
576	*/
577	__add_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
578	for (;;) {
579	set_current_state(TASK_KILLABLE);
580	if (ewq->msg.event == `0`)
581	break;
582	if (READ_ONCE(ctx->released) \|\|
583	fatal_signal_pending(current)) {
584	/*
585	* &ewq->wq may be queued in fork_event, but
586	* __remove_wait_queue ignores the head
587	* parameter. It would be a problem if it
588	* didn't.
589	*/
590	__remove_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
591	if (ewq->msg.event == UFFD_EVENT_FORK) {
592	struct userfaultfd_ctx *new;
593
594	new = (struct userfaultfd_ctx *)
595	(unsigned long)
596	ewq->msg.arg.reserved.reserved1;
597	release_new_ctx = new;
598	}
599	break;
600	}
601
602	spin_unlock_irq(lock: &ctx->event_wqh.lock);
603
604	wake_up_poll(&ctx->fd_wqh, EPOLLIN);
605	schedule();
606
607	spin_lock_irq(lock: &ctx->event_wqh.lock);
608	}
609	__set_current_state(TASK_RUNNING);
610	spin_unlock_irq(lock: &ctx->event_wqh.lock);
611
612	if (release_new_ctx) {
613	userfaultfd_release_new(ctx: release_new_ctx);
614	userfaultfd_ctx_put(ctx: release_new_ctx);
615	}
616
617	/*
618	* ctx may go away after this if the userfault pseudo fd is
619	* already released.
620	*/
621	out:
622	atomic_dec(v: &ctx->mmap_changing);
623	VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < `0`);
624	userfaultfd_ctx_put(ctx);
625	}
626
627	static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
628	struct userfaultfd_wait_queue *ewq)
629	{
630	ewq->msg.event = `0`;
631	wake_up_locked(&ctx->event_wqh);
632	__remove_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
633	}
634
635	int dup_userfaultfd(struct vm_area_struct vma, struct* list_head *fcs)
636	{
637	struct userfaultfd_ctx ctx = NULL, octx;
638	struct userfaultfd_fork_ctx *fctx;
639
640	octx = vma->vm_userfaultfd_ctx.ctx;
641	if (!octx)
642	return `0`;
643
644	if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
645	userfaultfd_reset_ctx(vma);
646	return `0`;
647	}
648
649	list_for_each_entry(fctx, fcs, list)
650	if (fctx->orig == octx) {
651	ctx = fctx->new;
652	break;
653	}
654
655	if (!ctx) {
656	fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
657	if (!fctx)
658	return -ENOMEM;
659
660	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
661	if (!ctx) {
662	kfree(objp: fctx);
663	return -ENOMEM;
664	}
665
666	refcount_set(r: &ctx->refcount, n: `1`);
667	ctx->flags = octx->flags;
668	ctx->features = octx->features;
669	ctx->released = false;
670	init_rwsem(&ctx->map_changing_lock);
671	atomic_set(v: &ctx->mmap_changing, i: `0`);
672	ctx->mm = vma->vm_mm;
673	mmgrab(mm: ctx->mm);
674
675	userfaultfd_ctx_get(ctx: octx);
676	down_write(sem: &octx->map_changing_lock);
677	atomic_inc(v: &octx->mmap_changing);
678	up_write(sem: &octx->map_changing_lock);
679	fctx->orig = octx;
680	fctx->new = ctx;
681	list_add_tail(new: &fctx->list, head: fcs);
682	}
683
684	vma->vm_userfaultfd_ctx.ctx = ctx;
685	return `0`;
686	}
687
688	static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
689	{
690	struct userfaultfd_ctx *ctx = fctx->orig;
691	struct userfaultfd_wait_queue ewq;
692
693	msg_init(msg: &ewq.msg);
694
695	ewq.msg.event = UFFD_EVENT_FORK;
696	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
697
698	userfaultfd_event_wait_completion(ctx, ewq: &ewq);
699	}
700
701	void dup_userfaultfd_complete(struct list_head *fcs)
702	{
703	struct userfaultfd_fork_ctx fctx, n;
704
705	list_for_each_entry_safe(fctx, n, fcs, list) {
706	dup_fctx(fctx);
707	list_del(entry: &fctx->list);
708	kfree(objp: fctx);
709	}
710	}
711
712	void dup_userfaultfd_fail(struct list_head *fcs)
713	{
714	struct userfaultfd_fork_ctx fctx, n;
715
716	/*
717	* An error has occurred on fork, we will tear memory down, but have
718	* allocated memory for fctx's and raised reference counts for both the
719	* original and child contexts (and on the mm for each as a result).
720	*
721	* These would ordinarily be taken care of by a user handling the event,
722	* but we are no longer doing so, so manually clean up here.
723	*
724	* mm tear down will take care of cleaning up VMA contexts.
725	*/
726	list_for_each_entry_safe(fctx, n, fcs, list) {
727	struct userfaultfd_ctx *octx = fctx->orig;
728	struct userfaultfd_ctx *ctx = fctx->new;
729
730	atomic_dec(v: &octx->mmap_changing);
731	VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < `0`);
732	userfaultfd_ctx_put(ctx: octx);
733	userfaultfd_ctx_put(ctx);
734
735	list_del(entry: &fctx->list);
736	kfree(objp: fctx);
737	}
738	}
739
740	void mremap_userfaultfd_prep(struct vm_area_struct *vma,
741	struct vm_userfaultfd_ctx *vm_ctx)
742	{
743	struct userfaultfd_ctx *ctx;
744
745	ctx = vma->vm_userfaultfd_ctx.ctx;
746
747	if (!ctx)
748	return;
749
750	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
751	vm_ctx->ctx = ctx;
752	userfaultfd_ctx_get(ctx);
753	down_write(sem: &ctx->map_changing_lock);
754	atomic_inc(v: &ctx->mmap_changing);
755	up_write(sem: &ctx->map_changing_lock);
756	} else {
757	/ Drop uffd context if remap feature not enabled /
758	userfaultfd_reset_ctx(vma);
759	}
760	}
761
762	void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
763	unsigned long from, unsigned long to,
764	unsigned long len)
765	{
766	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
767	struct userfaultfd_wait_queue ewq;
768
769	if (!ctx)
770	return;
771
772	msg_init(msg: &ewq.msg);
773
774	ewq.msg.event = UFFD_EVENT_REMAP;
775	ewq.msg.arg.remap.from = from;
776	ewq.msg.arg.remap.to = to;
777	ewq.msg.arg.remap.len = len;
778
779	userfaultfd_event_wait_completion(ctx, ewq: &ewq);
780	}
781
782	void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx)
783	{
784	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
785
786	if (!ctx)
787	return;
788
789	userfaultfd_ctx_put(ctx);
790	}
791
792	bool userfaultfd_remove(struct vm_area_struct *vma,
793	unsigned long start, unsigned long end)
794	{
795	struct mm_struct *mm = vma->vm_mm;
796	struct userfaultfd_ctx *ctx;
797	struct userfaultfd_wait_queue ewq;
798
799	ctx = vma->vm_userfaultfd_ctx.ctx;
800	if (!ctx \|\| !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
801	return true;
802
803	userfaultfd_ctx_get(ctx);
804	down_write(sem: &ctx->map_changing_lock);
805	atomic_inc(v: &ctx->mmap_changing);
806	up_write(sem: &ctx->map_changing_lock);
807	mmap_read_unlock(mm);
808
809	msg_init(msg: &ewq.msg);
810
811	ewq.msg.event = UFFD_EVENT_REMOVE;
812	ewq.msg.arg.remove.start = start;
813	ewq.msg.arg.remove.end = end;
814
815	userfaultfd_event_wait_completion(ctx, ewq: &ewq);
816
817	return false;
818	}
819
820	static bool has_unmap_ctx(struct userfaultfd_ctx ctx, struct* list_head *unmaps,
821	unsigned long start, unsigned long end)
822	{
823	struct userfaultfd_unmap_ctx *unmap_ctx;
824
825	list_for_each_entry(unmap_ctx, unmaps, list)
826	if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
827	unmap_ctx->end == end)
828	return true;
829
830	return false;
831	}
832
833	int userfaultfd_unmap_prep(struct vm_area_struct vma, unsigned* long start,
834	unsigned long end, struct list_head *unmaps)
835	{
836	struct userfaultfd_unmap_ctx *unmap_ctx;
837	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
838
839	if (!ctx \|\| !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) \|\|
840	has_unmap_ctx(ctx, unmaps, start, end))
841	return `0`;
842
843	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
844	if (!unmap_ctx)
845	return -ENOMEM;
846
847	userfaultfd_ctx_get(ctx);
848	down_write(sem: &ctx->map_changing_lock);
849	atomic_inc(v: &ctx->mmap_changing);
850	up_write(sem: &ctx->map_changing_lock);
851	unmap_ctx->ctx = ctx;
852	unmap_ctx->start = start;
853	unmap_ctx->end = end;
854	list_add_tail(new: &unmap_ctx->list, head: unmaps);
855
856	return `0`;
857	}
858
859	void userfaultfd_unmap_complete(struct mm_struct mm, struct* list_head *uf)
860	{
861	struct userfaultfd_unmap_ctx ctx, n;
862	struct userfaultfd_wait_queue ewq;
863
864	list_for_each_entry_safe(ctx, n, uf, list) {
865	msg_init(msg: &ewq.msg);
866
867	ewq.msg.event = UFFD_EVENT_UNMAP;
868	ewq.msg.arg.remove.start = ctx->start;
869	ewq.msg.arg.remove.end = ctx->end;
870
871	userfaultfd_event_wait_completion(ctx: ctx->ctx, ewq: &ewq);
872
873	list_del(entry: &ctx->list);
874	kfree(objp: ctx);
875	}
876	}
877
878	static int userfaultfd_release(struct inode inode, struct* file *file)
879	{
880	struct userfaultfd_ctx *ctx = file->private_data;
881	struct mm_struct *mm = ctx->mm;
882	/ len == 0 means wake all /
883	struct userfaultfd_wake_range range = { .len = `0`, };
884
885	WRITE_ONCE(ctx->released, true);
886
887	userfaultfd_release_all(mm, ctx);
888
889	/*
890	* After no new page faults can wait on this fault_*wqh, flush
891	* the last page faults that may have been already waiting on
892	* the fault_*wqh.
893	*/
894	spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
895	__wake_up_locked_key(wq_head: &ctx->fault_pending_wqh, TASK_NORMAL, key: &range);
896	__wake_up(wq_head: &ctx->fault_wqh, TASK_NORMAL, nr: `1`, key: &range);
897	spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
898
899	/ Flush pending events that may still wait on event_wqh /
900	wake_up_all(&ctx->event_wqh);
901
902	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
903	userfaultfd_ctx_put(ctx);
904	return `0`;
905	}
906
907	/ fault_pending_wqh.lock must be hold by the caller /
908	static inline struct userfaultfd_wait_queue *find_userfault_in(
909	wait_queue_head_t *wqh)
910	{
911	wait_queue_entry_t *wq;
912	struct userfaultfd_wait_queue *uwq;
913
914	lockdep_assert_held(&wqh->lock);
915
916	uwq = NULL;
917	if (!waitqueue_active(wq_head: wqh))
918	goto out;
919	/ walk in reverse to provide FIFO behavior to read userfaults /
920	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
921	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
922	out:
923	return uwq;
924	}
925
926	static inline struct userfaultfd_wait_queue *find_userfault(
927	struct userfaultfd_ctx *ctx)
928	{
929	return find_userfault_in(wqh: &ctx->fault_pending_wqh);
930	}
931
932	static inline struct userfaultfd_wait_queue *find_userfault_evt(
933	struct userfaultfd_ctx *ctx)
934	{
935	return find_userfault_in(wqh: &ctx->event_wqh);
936	}
937
938	static __poll_t userfaultfd_poll(struct file file, poll_table wait)
939	{
940	struct userfaultfd_ctx *ctx = file->private_data;
941	__poll_t ret;
942
943	poll_wait(filp: file, wait_address: &ctx->fd_wqh, p: wait);
944
945	if (!userfaultfd_is_initialized(ctx))
946	return EPOLLERR;
947
948	/*
949	* poll() never guarantees that read won't block.
950	* userfaults can be waken before they're read().
951	*/
952	if (unlikely(!(file->f_flags & O_NONBLOCK)))
953	return EPOLLERR;
954	/*
955	* lockless access to see if there are pending faults
956	* __pollwait last action is the add_wait_queue but
957	* the spin_unlock would allow the waitqueue_active to
958	* pass above the actual list_add inside
959	* add_wait_queue critical section. So use a full
960	* memory barrier to serialize the list_add write of
961	* add_wait_queue() with the waitqueue_active read
962	* below.
963	*/
964	ret = `0`;
965	smp_mb();
966	if (waitqueue_active(wq_head: &ctx->fault_pending_wqh))
967	ret = EPOLLIN;
968	else if (waitqueue_active(wq_head: &ctx->event_wqh))
969	ret = EPOLLIN;
970
971	return ret;
972	}
973
974	static const struct file_operations userfaultfd_fops;
975
976	static int resolve_userfault_fork(struct userfaultfd_ctx *new,
977	struct inode *inode,
978	struct uffd_msg *msg)
979	{
980	int fd;
981
982	fd = anon_inode_create_getfd(name: "[userfaultfd]", fops: &userfaultfd_fops, priv: new,
983	O_RDONLY \| (new->flags & UFFD_SHARED_FCNTL_FLAGS), context_inode: inode);
984	if (fd < `0`)
985	return fd;
986
987	msg->arg.reserved.reserved1 = `0`;
988	msg->arg.fork.ufd = fd;
989	return `0`;
990	}
991
992	static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx ctx, int* no_wait,
993	struct uffd_msg msg, struct* inode *inode)
994	{
995	ssize_t ret;
996	DECLARE_WAITQUEUE(wait, current);
997	struct userfaultfd_wait_queue *uwq;
998	/*
999	* Handling fork event requires sleeping operations, so
1000	* we drop the event_wqh lock, then do these ops, then
1001	* lock it back and wake up the waiter. While the lock is
1002	* dropped the ewq may go away so we keep track of it
1003	* carefully.
1004	*/
1005	LIST_HEAD(fork_event);
1006	struct userfaultfd_ctx *fork_nctx = NULL;
1007
1008	/ always take the fd_wqh lock before the fault_pending_wqh lock /
1009	spin_lock_irq(lock: &ctx->fd_wqh.lock);
1010	__add_wait_queue(wq_head: &ctx->fd_wqh, wq_entry: &wait);
1011	for (;;) {
1012	set_current_state(TASK_INTERRUPTIBLE);
1013	spin_lock(lock: &ctx->fault_pending_wqh.lock);
1014	uwq = find_userfault(ctx);
1015	if (uwq) {
1016	/*
1017	* Use a seqcount to repeat the lockless check
1018	* in wake_userfault() to avoid missing
1019	* wakeups because during the refile both
1020	* waitqueue could become empty if this is the
1021	* only userfault.
1022	*/
1023	write_seqcount_begin(&ctx->refile_seq);
1024
1025	/*
1026	* The fault_pending_wqh.lock prevents the uwq
1027	* to disappear from under us.
1028	*
1029	* Refile this userfault from
1030	* fault_pending_wqh to fault_wqh, it's not
1031	* pending anymore after we read it.
1032	*
1033	* Use list_del() by hand (as
1034	* userfaultfd_wake_function also uses
1035	* list_del_init() by hand) to be sure nobody
1036	* changes __remove_wait_queue() to use
1037	* list_del_init() in turn breaking the
1038	* !list_empty_careful() check in
1039	* handle_userfault(). The uwq->wq.head list
1040	* must never be empty at any time during the
1041	* refile, or the waitqueue could disappear
1042	* from under us. The "wait_queue_head_t"
1043	* parameter of __remove_wait_queue() is unused
1044	* anyway.
1045	*/
1046	list_del(entry: &uwq->wq.entry);
1047	add_wait_queue(wq_head: &ctx->fault_wqh, wq_entry: &uwq->wq);
1048
1049	write_seqcount_end(&ctx->refile_seq);
1050
1051	/ careful to always initialize msg if ret == 0 /
1052	*msg = uwq->msg;
1053	spin_unlock(lock: &ctx->fault_pending_wqh.lock);
1054	ret = `0`;
1055	break;
1056	}
1057	spin_unlock(lock: &ctx->fault_pending_wqh.lock);
1058
1059	spin_lock(lock: &ctx->event_wqh.lock);
1060	uwq = find_userfault_evt(ctx);
1061	if (uwq) {
1062	*msg = uwq->msg;
1063
1064	if (uwq->msg.event == UFFD_EVENT_FORK) {
1065	fork_nctx = (struct userfaultfd_ctx *)
1066	(unsigned long)
1067	uwq->msg.arg.reserved.reserved1;
1068	list_move(list: &uwq->wq.entry, head: &fork_event);
1069	/*
1070	* fork_nctx can be freed as soon as
1071	* we drop the lock, unless we take a
1072	* reference on it.
1073	*/
1074	userfaultfd_ctx_get(ctx: fork_nctx);
1075	spin_unlock(lock: &ctx->event_wqh.lock);
1076	ret = `0`;
1077	break;
1078	}
1079
1080	userfaultfd_event_complete(ctx, ewq: uwq);
1081	spin_unlock(lock: &ctx->event_wqh.lock);
1082	ret = `0`;
1083	break;
1084	}
1085	spin_unlock(lock: &ctx->event_wqh.lock);
1086
1087	if (signal_pending(current)) {
1088	ret = -ERESTARTSYS;
1089	break;
1090	}
1091	if (no_wait) {
1092	ret = -EAGAIN;
1093	break;
1094	}
1095	spin_unlock_irq(lock: &ctx->fd_wqh.lock);
1096	schedule();
1097	spin_lock_irq(lock: &ctx->fd_wqh.lock);
1098	}
1099	__remove_wait_queue(wq_head: &ctx->fd_wqh, wq_entry: &wait);
1100	__set_current_state(TASK_RUNNING);
1101	spin_unlock_irq(lock: &ctx->fd_wqh.lock);
1102
1103	if (!ret && msg->event == UFFD_EVENT_FORK) {
1104	ret = resolve_userfault_fork(new: fork_nctx, inode, msg);
1105	spin_lock_irq(lock: &ctx->event_wqh.lock);
1106	if (!list_empty(head: &fork_event)) {
1107	/*
1108	* The fork thread didn't abort, so we can
1109	* drop the temporary refcount.
1110	*/
1111	userfaultfd_ctx_put(ctx: fork_nctx);
1112
1113	uwq = list_first_entry(&fork_event,
1114	typeof(*uwq),
1115	wq.entry);
1116	/*
1117	* If fork_event list wasn't empty and in turn
1118	* the event wasn't already released by fork
1119	* (the event is allocated on fork kernel
1120	* stack), put the event back to its place in
1121	* the event_wq. fork_event head will be freed
1122	* as soon as we return so the event cannot
1123	* stay queued there no matter the current
1124	* "ret" value.
1125	*/
1126	list_del(entry: &uwq->wq.entry);
1127	__add_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &uwq->wq);
1128
1129	/*
1130	* Leave the event in the waitqueue and report
1131	* error to userland if we failed to resolve
1132	* the userfault fork.
1133	*/
1134	if (likely(!ret))
1135	userfaultfd_event_complete(ctx, ewq: uwq);
1136	} else {
1137	/*
1138	* Here the fork thread aborted and the
1139	* refcount from the fork thread on fork_nctx
1140	* has already been released. We still hold
1141	* the reference we took before releasing the
1142	* lock above. If resolve_userfault_fork
1143	* failed we've to drop it because the
1144	* fork_nctx has to be freed in such case. If
1145	* it succeeded we'll hold it because the new
1146	* uffd references it.
1147	*/
1148	if (ret)
1149	userfaultfd_ctx_put(ctx: fork_nctx);
1150	}
1151	spin_unlock_irq(lock: &ctx->event_wqh.lock);
1152	}
1153
1154	return ret;
1155	}
1156
1157	static ssize_t userfaultfd_read_iter(struct kiocb iocb, struct* iov_iter *to)
1158	{
1159	struct file *file = iocb->ki_filp;
1160	struct userfaultfd_ctx *ctx = file->private_data;
1161	ssize_t _ret, ret = `0`;
1162	struct uffd_msg msg;
1163	struct inode *inode = file_inode(f: file);
1164	bool no_wait;
1165
1166	if (!userfaultfd_is_initialized(ctx))
1167	return -EINVAL;
1168
1169	no_wait = file->f_flags & O_NONBLOCK \|\| iocb->ki_flags & IOCB_NOWAIT;
1170	for (;;) {
1171	if (iov_iter_count(i: to) < sizeof(msg))
1172	return ret ? ret : -EINVAL;
1173	_ret = userfaultfd_ctx_read(ctx, no_wait, msg: &msg, inode);
1174	if (_ret < `0`)
1175	return ret ? ret : _ret;
1176	_ret = !copy_to_iter_full(addr: &msg, bytes: sizeof(msg), i: to);
1177	if (_ret)
1178	return ret ? ret : -EFAULT;
1179	ret += sizeof(msg);
1180	/*
1181	* Allow to read more than one fault at time but only
1182	* block if waiting for the very first one.
1183	*/
1184	no_wait = true;
1185	}
1186	}
1187
1188	static void __wake_userfault(struct userfaultfd_ctx *ctx,
1189	struct userfaultfd_wake_range *range)
1190	{
1191	spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
1192	/ wake all in the range and autoremove /
1193	if (waitqueue_active(wq_head: &ctx->fault_pending_wqh))
1194	__wake_up_locked_key(wq_head: &ctx->fault_pending_wqh, TASK_NORMAL,
1195	key: range);
1196	if (waitqueue_active(wq_head: &ctx->fault_wqh))
1197	__wake_up(wq_head: &ctx->fault_wqh, TASK_NORMAL, nr: `1`, key: range);
1198	spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
1199	}
1200
1201	static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1202	struct userfaultfd_wake_range *range)
1203	{
1204	unsigned seq;
1205	bool need_wakeup;
1206
1207	/*
1208	* To be sure waitqueue_active() is not reordered by the CPU
1209	* before the pagetable update, use an explicit SMP memory
1210	* barrier here. PT lock release or mmap_read_unlock(mm) still
1211	* have release semantics that can allow the
1212	* waitqueue_active() to be reordered before the pte update.
1213	*/
1214	smp_mb();
1215
1216	/*
1217	* Use waitqueue_active because it's very frequent to
1218	* change the address space atomically even if there are no
1219	* userfaults yet. So we take the spinlock only when we're
1220	* sure we've userfaults to wake.
1221	*/
1222	do {
1223	seq = read_seqcount_begin(&ctx->refile_seq);
1224	need_wakeup = waitqueue_active(wq_head: &ctx->fault_pending_wqh) \|\|
1225	waitqueue_active(wq_head: &ctx->fault_wqh);
1226	cond_resched();
1227	} while (read_seqcount_retry(&ctx->refile_seq, seq));
1228	if (need_wakeup)
1229	__wake_userfault(ctx, range);
1230	}
1231
1232	static __always_inline int validate_unaligned_range(
1233	struct mm_struct *mm, __u64 start, __u64 len)
1234	{
1235	__u64 task_size = mm->task_size;
1236
1237	if (len & ~PAGE_MASK)
1238	return -EINVAL;
1239	if (!len)
1240	return -EINVAL;
1241	if (start < mmap_min_addr)
1242	return -EINVAL;
1243	if (start >= task_size)
1244	return -EINVAL;
1245	if (len > task_size - start)
1246	return -EINVAL;
1247	if (start + len <= start)
1248	return -EINVAL;
1249	return `0`;
1250	}
1251
1252	static __always_inline int validate_range(struct mm_struct *mm,
1253	__u64 start, __u64 len)
1254	{
1255	if (start & ~PAGE_MASK)
1256	return -EINVAL;
1257
1258	return validate_unaligned_range(mm, start, len);
1259	}
1260
1261	static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1262	unsigned long arg)
1263	{
1264	struct mm_struct *mm = ctx->mm;
1265	struct vm_area_struct vma, cur;
1266	int ret;
1267	struct uffdio_register uffdio_register;
1268	struct uffdio_register __user *user_uffdio_register;
1269	vm_flags_t vm_flags;
1270	bool found;
1271	bool basic_ioctls;
1272	unsigned long start, end;
1273	struct vma_iterator vmi;
1274	bool wp_async = userfaultfd_wp_async_ctx(ctx);
1275
1276	user_uffdio_register = (struct uffdio_register __user *) arg;
1277
1278	ret = -EFAULT;
1279	if (copy_from_user(to: &uffdio_register, from: user_uffdio_register,
1280	n: sizeof(uffdio_register)-sizeof(__u64)))
1281	goto out;
1282
1283	ret = -EINVAL;
1284	if (!uffdio_register.mode)
1285	goto out;
1286	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1287	goto out;
1288	vm_flags = `0`;
1289	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1290	vm_flags \|= VM_UFFD_MISSING;
1291	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1292	if (!pgtable_supports_uffd_wp())
1293	goto out;
1294
1295	vm_flags \|= VM_UFFD_WP;
1296	}
1297	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1298	#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1299	goto out;
1300	#endif
1301	vm_flags \|= VM_UFFD_MINOR;
1302	}
1303
1304	ret = validate_range(mm, start: uffdio_register.range.start,
1305	len: uffdio_register.range.len);
1306	if (ret)
1307	goto out;
1308
1309	start = uffdio_register.range.start;
1310	end = start + uffdio_register.range.len;
1311
1312	ret = -ENOMEM;
1313	if (!mmget_not_zero(mm))
1314	goto out;
1315
1316	ret = -EINVAL;
1317	mmap_write_lock(mm);
1318	vma_iter_init(vmi: &vmi, mm, addr: start);
1319	vma = vma_find(vmi: &vmi, max: end);
1320	if (!vma)
1321	goto out_unlock;
1322
1323	/*
1324	* If the first vma contains huge pages, make sure start address
1325	* is aligned to huge page size.
1326	*/
1327	if (is_vm_hugetlb_page(vma)) {
1328	unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1329
1330	if (start & (vma_hpagesize - `1`))
1331	goto out_unlock;
1332	}
1333
1334	/*
1335	* Search for not compatible vmas.
1336	*/
1337	found = false;
1338	basic_ioctls = false;
1339	cur = vma;
1340	do {
1341	cond_resched();
1342
1343	VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
1344	!!(cur->vm_flags & __VM_UFFD_FLAGS));
1345
1346	/ check not compatible vmas /
1347	ret = -EINVAL;
1348	if (!vma_can_userfault(vma: cur, vm_flags, wp_async))
1349	goto out_unlock;
1350
1351	/*
1352	* UFFDIO_COPY will fill file holes even without
1353	* PROT_WRITE. This check enforces that if this is a
1354	* MAP_SHARED, the process has write permission to the backing
1355	* file. If VM_MAYWRITE is set it also enforces that on a
1356	* MAP_SHARED vma: there is no F_WRITE_SEAL and no further
1357	* F_WRITE_SEAL can be taken until the vma is destroyed.
1358	*/
1359	ret = -EPERM;
1360	if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1361	goto out_unlock;
1362
1363	/*
1364	* If this vma contains ending address, and huge pages
1365	* check alignment.
1366	*/
1367	if (is_vm_hugetlb_page(vma: cur) && end <= cur->vm_end &&
1368	end > cur->vm_start) {
1369	unsigned long vma_hpagesize = vma_kernel_pagesize(vma: cur);
1370
1371	ret = -EINVAL;
1372
1373	if (end & (vma_hpagesize - `1`))
1374	goto out_unlock;
1375	}
1376	if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1377	goto out_unlock;
1378
1379	/*
1380	* Check that this vma isn't already owned by a
1381	* different userfaultfd. We can't allow more than one
1382	* userfaultfd to own a single vma simultaneously or we
1383	* wouldn't know which one to deliver the userfaults to.
1384	*/
1385	ret = -EBUSY;
1386	if (cur->vm_userfaultfd_ctx.ctx &&
1387	cur->vm_userfaultfd_ctx.ctx != ctx)
1388	goto out_unlock;
1389
1390	/*
1391	* Note vmas containing huge pages
1392	*/
1393	if (is_vm_hugetlb_page(vma: cur))
1394	basic_ioctls = true;
1395
1396	found = true;
1397	} for_each_vma_range(vmi, cur, end);
1398	VM_WARN_ON_ONCE(!found);
1399
1400	ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
1401	wp_async);
1402
1403	out_unlock:
1404	mmap_write_unlock(mm);
1405	mmput(mm);
1406	if (!ret) {
1407	__u64 ioctls_out;
1408
1409	ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1410	UFFD_API_RANGE_IOCTLS;
1411
1412	/*
1413	* Declare the WP ioctl only if the WP mode is
1414	* specified and all checks passed with the range
1415	*/
1416	if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1417	ioctls_out &= ~((__u64)`1` << _UFFDIO_WRITEPROTECT);
1418
1419	/ CONTINUE ioctl is only supported for MINOR ranges. /
1420	if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1421	ioctls_out &= ~((__u64)`1` << _UFFDIO_CONTINUE);
1422
1423	/*
1424	* Now that we scanned all vmas we can already tell
1425	* userland which ioctls methods are guaranteed to
1426	* succeed on this range.
1427	*/
1428	if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1429	ret = -EFAULT;
1430	}
1431	out:
1432	return ret;
1433	}
1434
1435	static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1436	unsigned long arg)
1437	{
1438	struct mm_struct *mm = ctx->mm;
1439	struct vm_area_struct vma, prev, *cur;
1440	int ret;
1441	struct uffdio_range uffdio_unregister;
1442	bool found;
1443	unsigned long start, end, vma_end;
1444	const void __user buf = (void* __user *)arg;
1445	struct vma_iterator vmi;
1446	bool wp_async = userfaultfd_wp_async_ctx(ctx);
1447
1448	ret = -EFAULT;
1449	if (copy_from_user(to: &uffdio_unregister, from: buf, n: sizeof(uffdio_unregister)))
1450	goto out;
1451
1452	ret = validate_range(mm, start: uffdio_unregister.start,
1453	len: uffdio_unregister.len);
1454	if (ret)
1455	goto out;
1456
1457	start = uffdio_unregister.start;
1458	end = start + uffdio_unregister.len;
1459
1460	ret = -ENOMEM;
1461	if (!mmget_not_zero(mm))
1462	goto out;
1463
1464	mmap_write_lock(mm);
1465	ret = -EINVAL;
1466	vma_iter_init(vmi: &vmi, mm, addr: start);
1467	vma = vma_find(vmi: &vmi, max: end);
1468	if (!vma)
1469	goto out_unlock;
1470
1471	/*
1472	* If the first vma contains huge pages, make sure start address
1473	* is aligned to huge page size.
1474	*/
1475	if (is_vm_hugetlb_page(vma)) {
1476	unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1477
1478	if (start & (vma_hpagesize - `1`))
1479	goto out_unlock;
1480	}
1481
1482	/*
1483	* Search for not compatible vmas.
1484	*/
1485	found = false;
1486	cur = vma;
1487	do {
1488	cond_resched();
1489
1490	VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
1491	!!(cur->vm_flags & __VM_UFFD_FLAGS));
1492
1493	/*
1494	* Prevent unregistering through a different userfaultfd than
1495	* the one used for registration.
1496	*/
1497	if (cur->vm_userfaultfd_ctx.ctx &&
1498	cur->vm_userfaultfd_ctx.ctx != ctx)
1499	goto out_unlock;
1500
1501	/*
1502	* Check not compatible vmas, not strictly required
1503	* here as not compatible vmas cannot have an
1504	* userfaultfd_ctx registered on them, but this
1505	* provides for more strict behavior to notice
1506	* unregistration errors.
1507	*/
1508	if (!vma_can_userfault(vma: cur, vm_flags: cur->vm_flags, wp_async))
1509	goto out_unlock;
1510
1511	found = true;
1512	} for_each_vma_range(vmi, cur, end);
1513	VM_WARN_ON_ONCE(!found);
1514
1515	vma_iter_set(vmi: &vmi, addr: start);
1516	prev = vma_prev(vmi: &vmi);
1517	if (vma->vm_start < start)
1518	prev = vma;
1519
1520	ret = `0`;
1521	for_each_vma_range(vmi, vma, end) {
1522	cond_resched();
1523
1524	/ VMA not registered with userfaultfd. /
1525	if (!vma->vm_userfaultfd_ctx.ctx)
1526	goto skip;
1527
1528	VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
1529	VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
1530	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
1531
1532	if (vma->vm_start > start)
1533	start = vma->vm_start;
1534	vma_end = min(end, vma->vm_end);
1535
1536	if (userfaultfd_missing(vma)) {
1537	/*
1538	* Wake any concurrent pending userfault while
1539	* we unregister, so they will not hang
1540	* permanently and it avoids userland to call
1541	* UFFDIO_WAKE explicitly.
1542	*/
1543	struct userfaultfd_wake_range range;
1544	range.start = start;
1545	range.len = vma_end - start;
1546	wake_userfault(ctx: vma->vm_userfaultfd_ctx.ctx, range: &range);
1547	}
1548
1549	vma = userfaultfd_clear_vma(vmi: &vmi, prev, vma,
1550	start, end: vma_end);
1551	if (IS_ERR(ptr: vma)) {
1552	ret = PTR_ERR(ptr: vma);
1553	break;
1554	}
1555
1556	skip:
1557	prev = vma;
1558	start = vma->vm_end;
1559	}
1560
1561	out_unlock:
1562	mmap_write_unlock(mm);
1563	mmput(mm);
1564	out:
1565	return ret;
1566	}
1567
1568	/*
1569	* userfaultfd_wake may be used in combination with the
1570	* UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1571	*/
1572	static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1573	unsigned long arg)
1574	{
1575	int ret;
1576	struct uffdio_range uffdio_wake;
1577	struct userfaultfd_wake_range range;
1578	const void __user buf = (void* __user *)arg;
1579
1580	ret = -EFAULT;
1581	if (copy_from_user(to: &uffdio_wake, from: buf, n: sizeof(uffdio_wake)))
1582	goto out;
1583
1584	ret = validate_range(mm: ctx->mm, start: uffdio_wake.start, len: uffdio_wake.len);
1585	if (ret)
1586	goto out;
1587
1588	range.start = uffdio_wake.start;
1589	range.len = uffdio_wake.len;
1590
1591	/*
1592	* len == 0 means wake all and we don't want to wake all here,
1593	* so check it again to be sure.
1594	*/
1595	VM_WARN_ON_ONCE(!range.len);
1596
1597	wake_userfault(ctx, range: &range);
1598	ret = `0`;
1599
1600	out:
1601	return ret;
1602	}
1603
1604	static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1605	unsigned long arg)
1606	{
1607	__s64 ret;
1608	struct uffdio_copy uffdio_copy;
1609	struct uffdio_copy __user *user_uffdio_copy;
1610	struct userfaultfd_wake_range range;
1611	uffd_flags_t flags = `0`;
1612
1613	user_uffdio_copy = (struct uffdio_copy __user *) arg;
1614
1615	ret = -EAGAIN;
1616	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1617	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1618	return -EFAULT;
1619	goto out;
1620	}
1621
1622	ret = -EFAULT;
1623	if (copy_from_user(to: &uffdio_copy, from: user_uffdio_copy,
1624	/ don't copy "copy" last field /
1625	n: sizeof(uffdio_copy)-sizeof(__s64)))
1626	goto out;
1627
1628	ret = validate_unaligned_range(mm: ctx->mm, start: uffdio_copy.src,
1629	len: uffdio_copy.len);
1630	if (ret)
1631	goto out;
1632	ret = validate_range(mm: ctx->mm, start: uffdio_copy.dst, len: uffdio_copy.len);
1633	if (ret)
1634	goto out;
1635
1636	ret = -EINVAL;
1637	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE\|UFFDIO_COPY_MODE_WP))
1638	goto out;
1639	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1640	flags \|= MFILL_ATOMIC_WP;
1641	if (mmget_not_zero(mm: ctx->mm)) {
1642	ret = mfill_atomic_copy(ctx, dst_start: uffdio_copy.dst, src_start: uffdio_copy.src,
1643	len: uffdio_copy.len, flags);
1644	mmput(ctx->mm);
1645	} else {
1646	return -ESRCH;
1647	}
1648	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1649	return -EFAULT;
1650	if (ret < `0`)
1651	goto out;
1652	VM_WARN_ON_ONCE(!ret);
1653	/ len == 0 would wake all /
1654	range.len = ret;
1655	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1656	range.start = uffdio_copy.dst;
1657	wake_userfault(ctx, range: &range);
1658	}
1659	ret = range.len == uffdio_copy.len ? `0` : -EAGAIN;
1660	out:
1661	return ret;
1662	}
1663
1664	static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1665	unsigned long arg)
1666	{
1667	__s64 ret;
1668	struct uffdio_zeropage uffdio_zeropage;
1669	struct uffdio_zeropage __user *user_uffdio_zeropage;
1670	struct userfaultfd_wake_range range;
1671
1672	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1673
1674	ret = -EAGAIN;
1675	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1676	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1677	return -EFAULT;
1678	goto out;
1679	}
1680
1681	ret = -EFAULT;
1682	if (copy_from_user(to: &uffdio_zeropage, from: user_uffdio_zeropage,
1683	/ don't copy "zeropage" last field /
1684	n: sizeof(uffdio_zeropage)-sizeof(__s64)))
1685	goto out;
1686
1687	ret = validate_range(mm: ctx->mm, start: uffdio_zeropage.range.start,
1688	len: uffdio_zeropage.range.len);
1689	if (ret)
1690	goto out;
1691	ret = -EINVAL;
1692	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1693	goto out;
1694
1695	if (mmget_not_zero(mm: ctx->mm)) {
1696	ret = mfill_atomic_zeropage(ctx, dst_start: uffdio_zeropage.range.start,
1697	len: uffdio_zeropage.range.len);
1698	mmput(ctx->mm);
1699	} else {
1700	return -ESRCH;
1701	}
1702	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1703	return -EFAULT;
1704	if (ret < `0`)
1705	goto out;
1706	/ len == 0 would wake all /
1707	VM_WARN_ON_ONCE(!ret);
1708	range.len = ret;
1709	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1710	range.start = uffdio_zeropage.range.start;
1711	wake_userfault(ctx, range: &range);
1712	}
1713	ret = range.len == uffdio_zeropage.range.len ? `0` : -EAGAIN;
1714	out:
1715	return ret;
1716	}
1717
1718	static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1719	unsigned long arg)
1720	{
1721	int ret;
1722	struct uffdio_writeprotect uffdio_wp;
1723	struct uffdio_writeprotect __user *user_uffdio_wp;
1724	struct userfaultfd_wake_range range;
1725	bool mode_wp, mode_dontwake;
1726
1727	if (atomic_read(v: &ctx->mmap_changing))
1728	return -EAGAIN;
1729
1730	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1731
1732	if (copy_from_user(to: &uffdio_wp, from: user_uffdio_wp,
1733	n: sizeof(struct uffdio_writeprotect)))
1734	return -EFAULT;
1735
1736	ret = validate_range(mm: ctx->mm, start: uffdio_wp.range.start,
1737	len: uffdio_wp.range.len);
1738	if (ret)
1739	return ret;
1740
1741	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE \|
1742	UFFDIO_WRITEPROTECT_MODE_WP))
1743	return -EINVAL;
1744
1745	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1746	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1747
1748	if (mode_wp && mode_dontwake)
1749	return -EINVAL;
1750
1751	if (mmget_not_zero(mm: ctx->mm)) {
1752	ret = mwriteprotect_range(ctx, start: uffdio_wp.range.start,
1753	len: uffdio_wp.range.len, enable_wp: mode_wp);
1754	mmput(ctx->mm);
1755	} else {
1756	return -ESRCH;
1757	}
1758
1759	if (ret)
1760	return ret;
1761
1762	if (!mode_wp && !mode_dontwake) {
1763	range.start = uffdio_wp.range.start;
1764	range.len = uffdio_wp.range.len;
1765	wake_userfault(ctx, range: &range);
1766	}
1767	return ret;
1768	}
1769
1770	static int userfaultfd_continue(struct userfaultfd_ctx ctx, unsigned* long arg)
1771	{
1772	__s64 ret;
1773	struct uffdio_continue uffdio_continue;
1774	struct uffdio_continue __user *user_uffdio_continue;
1775	struct userfaultfd_wake_range range;
1776	uffd_flags_t flags = `0`;
1777
1778	user_uffdio_continue = (struct uffdio_continue __user *)arg;
1779
1780	ret = -EAGAIN;
1781	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1782	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1783	return -EFAULT;
1784	goto out;
1785	}
1786
1787	ret = -EFAULT;
1788	if (copy_from_user(to: &uffdio_continue, from: user_uffdio_continue,
1789	/ don't copy the output fields /
1790	n: sizeof(uffdio_continue) - (sizeof(__s64))))
1791	goto out;
1792
1793	ret = validate_range(mm: ctx->mm, start: uffdio_continue.range.start,
1794	len: uffdio_continue.range.len);
1795	if (ret)
1796	goto out;
1797
1798	ret = -EINVAL;
1799	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE \|
1800	UFFDIO_CONTINUE_MODE_WP))
1801	goto out;
1802	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1803	flags \|= MFILL_ATOMIC_WP;
1804
1805	if (mmget_not_zero(mm: ctx->mm)) {
1806	ret = mfill_atomic_continue(ctx, dst_start: uffdio_continue.range.start,
1807	len: uffdio_continue.range.len, flags);
1808	mmput(ctx->mm);
1809	} else {
1810	return -ESRCH;
1811	}
1812
1813	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1814	return -EFAULT;
1815	if (ret < `0`)
1816	goto out;
1817
1818	/ len == 0 would wake all /
1819	VM_WARN_ON_ONCE(!ret);
1820	range.len = ret;
1821	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1822	range.start = uffdio_continue.range.start;
1823	wake_userfault(ctx, range: &range);
1824	}
1825	ret = range.len == uffdio_continue.range.len ? `0` : -EAGAIN;
1826
1827	out:
1828	return ret;
1829	}
1830
1831	static inline int userfaultfd_poison(struct userfaultfd_ctx ctx, unsigned* long arg)
1832	{
1833	__s64 ret;
1834	struct uffdio_poison uffdio_poison;
1835	struct uffdio_poison __user *user_uffdio_poison;
1836	struct userfaultfd_wake_range range;
1837
1838	user_uffdio_poison = (struct uffdio_poison __user *)arg;
1839
1840	ret = -EAGAIN;
1841	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1842	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
1843	return -EFAULT;
1844	goto out;
1845	}
1846
1847	ret = -EFAULT;
1848	if (copy_from_user(to: &uffdio_poison, from: user_uffdio_poison,
1849	/ don't copy the output fields /
1850	n: sizeof(uffdio_poison) - (sizeof(__s64))))
1851	goto out;
1852
1853	ret = validate_range(mm: ctx->mm, start: uffdio_poison.range.start,
1854	len: uffdio_poison.range.len);
1855	if (ret)
1856	goto out;
1857
1858	ret = -EINVAL;
1859	if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
1860	goto out;
1861
1862	if (mmget_not_zero(mm: ctx->mm)) {
1863	ret = mfill_atomic_poison(ctx, start: uffdio_poison.range.start,
1864	len: uffdio_poison.range.len, flags: `0`);
1865	mmput(ctx->mm);
1866	} else {
1867	return -ESRCH;
1868	}
1869
1870	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
1871	return -EFAULT;
1872	if (ret < `0`)
1873	goto out;
1874
1875	/ len == 0 would wake all /
1876	VM_WARN_ON_ONCE(!ret);
1877	range.len = ret;
1878	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
1879	range.start = uffdio_poison.range.start;
1880	wake_userfault(ctx, range: &range);
1881	}
1882	ret = range.len == uffdio_poison.range.len ? `0` : -EAGAIN;
1883
1884	out:
1885	return ret;
1886	}
1887
1888	bool userfaultfd_wp_async(struct vm_area_struct *vma)
1889	{
1890	return userfaultfd_wp_async_ctx(ctx: vma->vm_userfaultfd_ctx.ctx);
1891	}
1892
1893	static inline unsigned int uffd_ctx_features(__u64 user_features)
1894	{
1895	/*
1896	* For the current set of features the bits just coincide. Set
1897	* UFFD_FEATURE_INITIALIZED to mark the features as enabled.
1898	*/
1899	return (unsigned int)user_features \| UFFD_FEATURE_INITIALIZED;
1900	}
1901
1902	static int userfaultfd_move(struct userfaultfd_ctx *ctx,
1903	unsigned long arg)
1904	{
1905	__s64 ret;
1906	struct uffdio_move uffdio_move;
1907	struct uffdio_move __user *user_uffdio_move;
1908	struct userfaultfd_wake_range range;
1909	struct mm_struct *mm = ctx->mm;
1910
1911	user_uffdio_move = (struct uffdio_move __user *) arg;
1912
1913	ret = -EAGAIN;
1914	if (unlikely(atomic_read(&ctx->mmap_changing))) {
1915	if (unlikely(put_user(ret, &user_uffdio_move->move)))
1916	return -EFAULT;
1917	goto out;
1918	}
1919
1920	if (copy_from_user(to: &uffdio_move, from: user_uffdio_move,
1921	/ don't copy "move" last field /
1922	n: sizeof(uffdio_move)-sizeof(__s64)))
1923	return -EFAULT;
1924
1925	/ Do not allow cross-mm moves. /
1926	if (mm != current->mm)
1927	return -EINVAL;
1928
1929	ret = validate_range(mm, start: uffdio_move.dst, len: uffdio_move.len);
1930	if (ret)
1931	return ret;
1932
1933	ret = validate_range(mm, start: uffdio_move.src, len: uffdio_move.len);
1934	if (ret)
1935	return ret;
1936
1937	if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES\|
1938	UFFDIO_MOVE_MODE_DONTWAKE))
1939	return -EINVAL;
1940
1941	if (mmget_not_zero(mm)) {
1942	ret = move_pages(ctx, dst_start: uffdio_move.dst, src_start: uffdio_move.src,
1943	len: uffdio_move.len, flags: uffdio_move.mode);
1944	mmput(mm);
1945	} else {
1946	return -ESRCH;
1947	}
1948
1949	if (unlikely(put_user(ret, &user_uffdio_move->move)))
1950	return -EFAULT;
1951	if (ret < `0`)
1952	goto out;
1953
1954	/ len == 0 would wake all /
1955	VM_WARN_ON(!ret);
1956	range.len = ret;
1957	if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
1958	range.start = uffdio_move.dst;
1959	wake_userfault(ctx, range: &range);
1960	}
1961	ret = range.len == uffdio_move.len ? `0` : -EAGAIN;
1962
1963	out:
1964	return ret;
1965	}
1966
1967	/*
1968	* userland asks for a certain API version and we return which bits
1969	* and ioctl commands are implemented in this kernel for such API
1970	* version or -EINVAL if unknown.
1971	*/
1972	static int userfaultfd_api(struct userfaultfd_ctx *ctx,
1973	unsigned long arg)
1974	{
1975	struct uffdio_api uffdio_api;
1976	void __user buf = (void* __user *)arg;
1977	unsigned int ctx_features;
1978	int ret;
1979	__u64 features;
1980
1981	ret = -EFAULT;
1982	if (copy_from_user(to: &uffdio_api, from: buf, n: sizeof(uffdio_api)))
1983	goto out;
1984	features = uffdio_api.features;
1985	ret = -EINVAL;
1986	if (uffdio_api.api != UFFD_API)
1987	goto err_out;
1988	ret = -EPERM;
1989	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
1990	goto err_out;
1991
1992	/ WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally /
1993	if (features & UFFD_FEATURE_WP_ASYNC)
1994	features \|= UFFD_FEATURE_WP_UNPOPULATED;
1995
1996	/ report all available features and ioctls to userland /
1997	uffdio_api.features = UFFD_API_FEATURES;
1998	#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1999	uffdio_api.features &=
2000	~(UFFD_FEATURE_MINOR_HUGETLBFS \| UFFD_FEATURE_MINOR_SHMEM);
2001	#endif
2002	if (!pgtable_supports_uffd_wp())
2003	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
2004
2005	if (!uffd_supports_wp_marker()) {
2006	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
2007	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
2008	uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
2009	}
2010
2011	ret = -EINVAL;
2012	if (features & ~uffdio_api.features)
2013	goto err_out;
2014
2015	uffdio_api.ioctls = UFFD_API_IOCTLS;
2016	ret = -EFAULT;
2017	if (copy_to_user(to: buf, from: &uffdio_api, n: sizeof(uffdio_api)))
2018	goto out;
2019
2020	/ only enable the requested features for this uffd context /
2021	ctx_features = uffd_ctx_features(user_features: features);
2022	ret = -EINVAL;
2023	if (cmpxchg(&ctx->features, `0`, ctx_features) != `0`)
2024	goto err_out;
2025
2026	ret = `0`;
2027	out:
2028	return ret;
2029	err_out:
2030	memset(&uffdio_api, `0`, sizeof(uffdio_api));
2031	if (copy_to_user(to: buf, from: &uffdio_api, n: sizeof(uffdio_api)))
2032	ret = -EFAULT;
2033	goto out;
2034	}
2035
2036	static long userfaultfd_ioctl(struct file file, unsigned* cmd,
2037	unsigned long arg)
2038	{
2039	int ret = -EINVAL;
2040	struct userfaultfd_ctx *ctx = file->private_data;
2041
2042	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
2043	return -EINVAL;
2044
2045	switch(cmd) {
2046	case UFFDIO_API:
2047	ret = userfaultfd_api(ctx, arg);
2048	break;
2049	case UFFDIO_REGISTER:
2050	ret = userfaultfd_register(ctx, arg);
2051	break;
2052	case UFFDIO_UNREGISTER:
2053	ret = userfaultfd_unregister(ctx, arg);
2054	break;
2055	case UFFDIO_WAKE:
2056	ret = userfaultfd_wake(ctx, arg);
2057	break;
2058	case UFFDIO_COPY:
2059	ret = userfaultfd_copy(ctx, arg);
2060	break;
2061	case UFFDIO_ZEROPAGE:
2062	ret = userfaultfd_zeropage(ctx, arg);
2063	break;
2064	case UFFDIO_MOVE:
2065	ret = userfaultfd_move(ctx, arg);
2066	break;
2067	case UFFDIO_WRITEPROTECT:
2068	ret = userfaultfd_writeprotect(ctx, arg);
2069	break;
2070	case UFFDIO_CONTINUE:
2071	ret = userfaultfd_continue(ctx, arg);
2072	break;
2073	case UFFDIO_POISON:
2074	ret = userfaultfd_poison(ctx, arg);
2075	break;
2076	}
2077	return ret;
2078	}
2079
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler: print the number of entries queued on
 * fault_pending_wqh ("pending"), the combined number of entries on
 * fault_pending_wqh and fault_wqh ("total"), and the API version,
 * context features and ioctl mask.
 */
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	unsigned long pending = 0, refiled = 0;
	wait_queue_entry_t *wq;

	/* Both lists are walked under fault_pending_wqh.lock. */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry)
		pending++;
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry)
		refiled++;
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols will be added, there will be all shown
	 * separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, pending + refiled, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
2107
2108	static const struct file_operations userfaultfd_fops = {
2109	#ifdef CONFIG_PROC_FS
2110	.show_fdinfo = userfaultfd_show_fdinfo,
2111	#endif
2112	.release = userfaultfd_release,
2113	.poll = userfaultfd_poll,
2114	.read_iter = userfaultfd_read_iter,
2115	.unlocked_ioctl = userfaultfd_ioctl,
2116	.compat_ioctl = compat_ptr_ioctl,
2117	.llseek = noop_llseek,
2118	};
2119
2120	static void init_once_userfaultfd_ctx(void *mem)
2121	{
2122	struct userfaultfd_ctx ctx = (struct* userfaultfd_ctx *) mem;
2123
2124	init_waitqueue_head(&ctx->fault_pending_wqh);
2125	init_waitqueue_head(&ctx->fault_wqh);
2126	init_waitqueue_head(&ctx->event_wqh);
2127	init_waitqueue_head(&ctx->fd_wqh);
2128	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
2129	}
2130
2131	static int new_userfaultfd(int flags)
2132	{
2133	struct userfaultfd_ctx *ctx __free(kfree) = NULL;
2134
2135	VM_WARN_ON_ONCE(!current->mm);
2136
2137	/ Check the UFFD_* constants for consistency. /
2138	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
2139
2140	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS \| UFFD_USER_MODE_ONLY))
2141	return -EINVAL;
2142
2143	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
2144	if (!ctx)
2145	return -ENOMEM;
2146
2147	refcount_set(r: &ctx->refcount, n: `1`);
2148	ctx->flags = flags;
2149	ctx->features = `0`;
2150	ctx->released = false;
2151	init_rwsem(&ctx->map_changing_lock);
2152	atomic_set(v: &ctx->mmap_changing, i: `0`);
2153	ctx->mm = current->mm;
2154
2155	FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS,
2156	anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
2157	O_RDONLY \| (flags & UFFD_SHARED_FCNTL_FLAGS),
2158	NULL));
2159	if (fdf.err)
2160	return fdf.err;
2161
2162	/ prevent the mm struct to be freed /
2163	mmgrab(mm: ctx->mm);
2164	fd_prepare_file(fdf)->f_mode \|= FMODE_NOWAIT;
2165	retain_and_null_ptr(ctx);
2166	return fd_publish(fdf);
2167	}
2168
2169	static inline bool userfaultfd_syscall_allowed(int flags)
2170	{
2171	/ Userspace-only page faults are always allowed /
2172	if (flags & UFFD_USER_MODE_ONLY)
2173	return true;
2174
2175	/*
2176	* The user is requesting a userfaultfd which can handle kernel faults.
2177	* Privileged users are always allowed to do this.
2178	*/
2179	if (capable(CAP_SYS_PTRACE))
2180	return true;
2181
2182	/ Otherwise, access to kernel fault handling is sysctl controlled. /
2183	return sysctl_unprivileged_userfaultfd;
2184	}
2185
2186	SYSCALL_DEFINE1(userfaultfd, int, flags)
2187	{
2188	if (!userfaultfd_syscall_allowed(flags))
2189	return -EPERM;
2190
2191	return new_userfaultfd(flags);
2192	}
2193
2194	static long userfaultfd_dev_ioctl(struct file file, unsigned* int cmd, unsigned long flags)
2195	{
2196	if (cmd != USERFAULTFD_IOC_NEW)
2197	return -EINVAL;
2198
2199	return new_userfaultfd(flags);
2200	}
2201
2202	static const struct file_operations userfaultfd_dev_fops = {
2203	.unlocked_ioctl = userfaultfd_dev_ioctl,
2204	.compat_ioctl = userfaultfd_dev_ioctl,
2205	.owner = THIS_MODULE,
2206	.llseek = noop_llseek,
2207	};
2208
2209	static struct miscdevice userfaultfd_misc = {
2210	.minor = MISC_DYNAMIC_MINOR,
2211	.name = "userfaultfd",
2212	.fops = &userfaultfd_dev_fops
2213	};
2214
2215	static int __init userfaultfd_init(void)
2216	{
2217	int ret;
2218
2219	ret = misc_register(misc: &userfaultfd_misc);
2220	if (ret)
2221	return ret;
2222
2223	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
2224	sizeof(struct userfaultfd_ctx),
2225	`0`,
2226	SLAB_HWCACHE_ALIGN\|SLAB_PANIC,
2227	init_once_userfaultfd_ctx);
2228	#ifdef CONFIG_SYSCTL
2229	register_sysctl_init("vm", vm_userfaultfd_table);
2230	#endif
2231	return `0`;
2232	}
2233	__initcall(userfaultfd_init);
2234

source code of linux/fs/userfaultfd.c