/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

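/*
 * Lightweight per-CPU counters for inspecting RSEQ behaviour. The names
 * mirror the sites which increment them: @exit counts exits to user mode,
 * @fastpath counts events handled by the inlined exit fast path below,
 * @ids counts ID updates written to user space, @cs counts critical
 * section inspections, and @clear/@fixup count whether the critical
 * section pointer was cleared or the IP was fixed up to the abort
 * handler. @signal and @slowpath are bumped from the out-of-line slow
 * path code, not from this header.
 */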
struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x)	do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */
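
/*
 * Usage sketch (matches the call sites below): counters are bumped with
 * e.g. rseq_stat_inc(rseq_stats.cs); which turns into a raw per-CPU
 * increment on the fast path and into a no-op when CONFIG_RSEQ_STATS=n.
 */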

#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

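/*
 * The wrappers below use tracepoint_enabled(), i.e. a static key backed
 * check, so the out-of-line __rseq_trace_*() functions are only called
 * when the corresponding tracepoints are actually enabled.
 */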
#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

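/*
 * RSEQ_BUILD_SLOW_PATH is defined by the translation unit which provides
 * the out-of-line slow path and debug code. There the shared helpers below
 * are built as regular functions; everywhere else they are forced inline
 * into the exit to user mode path.
 */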
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

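/*
 * Called from the generic entry code when an interrupt or exception came
 * from user mode. The exit path below uses the flag to restrict critical
 * section fixup to entries which actually interrupted user space; the
 * flag is only maintained when CONFIG_GENERIC_IRQ_ENTRY is enabled.
 */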
static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 * - If the critical section is invalid, terminate the task.
 *
 * - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 * - If valid and the instruction pointer is outside, clear the critical
 *   section address.
 *
 * Returns true if the section was valid and either fixup or clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It is cleared when the failure was an unresolved
 * page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */

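/*
 * For reference, the user space critical section descriptor which the
 * checks below operate on is defined in uapi/linux/rseq.h roughly as:
 *
 *	struct rseq_cs {
 *		__u32	version;
 *		__u32	flags;
 *		__u64	start_ip;
 *		__u64	post_commit_offset;
 *		__u64	abort_ip;
 *	};
 *
 * and the rseq ABI requires that the four bytes preceding the instruction
 * at @abort_ip contain the signature handed in at registration time.
 */
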
#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
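		/* @uc_head covers the adjacent u32 version and flags members in one 64-bit read */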
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * On debug kernels validate that user space did not mess with the IDs if
 * the debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here, that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 * - Must be in user space as x86 IRET would happily return to
		 *   the kernel.
		 * - The four bytes preceding the instruction at @abort_ip must
		 *   contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited abilities to write creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}
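
/*
 * Illustrative user space side (a sketch, not kernel code): the abort
 * handler of a critical section is preceded by the 32-bit signature which
 * was supplied at rseq registration, e.g. in assembly:
 *
 *	.long	0x53053053		// example signature value
 * abort_handler:
 *	// undo / retry path
 *
 * so the load of the four bytes at @abort_ip - 4 above matches t->rseq.sig.
 */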

/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address, when @csaddr != NULL. This allows putting the ID update and the
 * read under the same uaccess region to spare a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out, which spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It is cleared when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
	}

	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}

/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids, u32 node_id)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code this
	 * allows skipping the critical section check when the entry was
	 * not from a user space interrupt, unless debug mode is enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}
	if (likely(!csaddr))
		return true;
	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *    - Four 32-bit stores only if CPU ID / MM CID need to be updated
 *    - One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *    - One 64-bit load to retrieve the start IP
 *    - One 64-bit load to retrieve the offset for calculating the end
 *    - One 64-bit load to retrieve the abort IP
 *    - One 32-bit load to retrieve the signature
 *    - One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled.
	 */
	guard(pagefault)();
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If IDs have not changed, rseq_event::user_irq must be true.
		 * See rseq_sched_switch_event().
		 */
		u64 csaddr;

		if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
			return false;

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	struct rseq_ids ids = {
		.cpu_id = task_cpu(t),
		.mm_cid = task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor got the flag
	 * enforced by the rseq syscall or execve, then there is nothing to
	 * do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing to do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely(t->rseq.event.sched_switch)) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
	return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (likely(!test_tif_rseq(ti_work)))
		return false;

	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
		current->rseq.event.slowpath = true;
		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
		return true;
	}

	clear_tif_rseq();
	return false;
}
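
/*
 * Illustrative only: how the generic entry exit path is expected to use
 * this, consistent with the comments above (the real caller lives in the
 * generic entry code; names in this sketch are not authoritative):
 *
 *	ti_work = exit_to_user_mode_loop(regs, ti_work);
 *	...
 *	// With interrupts disabled, as the last work item:
 *	if (rseq_exit_to_user_mode_restart(regs, ti_work)) {
 *		// TIF_NOTIFY_RESUME was set; go back into the work loop
 *		// so the rseq slow path can handle the failure.
 *	}
 */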

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	/* Needed to remove the store for the !lockdep case */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		WARN_ON_ONCE(ev->sched_switch);
		ev->events = 0;
	}
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	lockdep_assert_once(!ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing could not clear it.
	 */
	ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */