/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#ifndef _LINUX_SCHED_EXT_H
#define _LINUX_SCHED_EXT_H

#ifdef CONFIG_SCHED_CLASS_EXT

#include <linux/llist.h>
#include <linux/rhashtable-types.h>

enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,

	/*
	 * %SCX_SLICE_DFL is used to refill the slice when the BPF scheduler
	 * fails to set the slice for a task that is selected for execution.
	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default
	 * slice refill has been triggered.
	 *
	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in bypass
	 * mode. As making forward progress for all tasks is the main goal of
	 * bypass mode, a shorter slice is used.
	 */
	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
	SCX_SLICE_BYPASS	= 5 * 1000000,	/* 5ms */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};

/*
 * DSQ (dispatch queue) IDs are 64-bit values with the following format:
 *
 * Bits: [63] [62 ..  0]
 *       [ B] [   ID    ]
 *
 *    B: 1 for IDs of built-in DSQs, 0 for ops-created user DSQs
 *   ID: 63-bit ID
 *
 * Built-in IDs:
 *
 * Bits: [63] [62] [61..32] [31 .. 0]
 *       [ 1] [ L] [   R  ] [    V   ]
 *
 *    1: 1 for built-in DSQs.
 *    L: 1 for LOCAL_ON DSQ IDs, 0 for others
 *    R: reserved, currently unused
 *    V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
 */
enum scx_dsq_id_flags {
	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,
	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,

	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};
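
/*
 * For example, a LOCAL_ON DSQ ID is built by OR'ing the target CPU number
 * into the V bits, and the CPU can be recovered by masking with
 * SCX_DSQ_LOCAL_CPU_MASK. A sketch from the BPF scheduler side, assuming
 * the scx_bpf_dsq_insert() kfunc referenced with p->scx.slice below:
 *
 *	u64 dsq_id = SCX_DSQ_LOCAL_ON | cpu;
 *
 *	// insert @p into @cpu's local DSQ with the default slice
 *	scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0);
 *
 *	// dsq_id & SCX_DSQ_LOCAL_CPU_MASK == cpu
 */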

/*
 * A dispatch queue (DSQ) can be either a FIFO or a p->scx.dsq_vtime ordered
 * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
 * buffer between the scheduler core and the BPF scheduler. See
 * Documentation/scheduler/sched-ext.rst for more details.
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;
	struct task_struct __rcu *first_task;	/* lockless peek at head */
	struct list_head	list;	/* tasks in dispatch order */
	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
	u32			nr;
	u32			seq;	/* used by BPF iter */
	u64			id;
	struct rhash_head	hash_node;
	struct llist_node	free_node;
	struct rcu_head		rcu;
};

/* scx_entity.flags */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */

	SCX_TASK_STATE_SHIFT	= 8,	/* bits 8 and 9 carry scx_task_state */
	SCX_TASK_STATE_BITS	= 2,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	SCX_TASK_CURSOR		= 1 << 31, /* iteration cursor, not a task */
};

/* scx_entity.flags & SCX_TASK_STATE_MASK */
enum scx_task_state {
	SCX_TASK_NONE,		/* ops.init_task() not called yet */
	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */

	SCX_TASK_NR_STATES,
};
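
/*
 * A minimal sketch of how the state is decoded from scx_entity.flags,
 * derived purely from the definitions above (the kernel uses an internal
 * helper for this; the name here is illustrative):
 *
 *	static enum scx_task_state scx_task_state_of(const struct task_struct *p)
 *	{
 *		return (p->scx.flags & SCX_TASK_STATE_MASK) >>
 *			SCX_TASK_STATE_SHIFT;
 *	}
 */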

/* scx_entity.dsq_flags */
enum scx_ent_dsq_flags {
	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a DSQ */
};

/*
 * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
 * everywhere and the following bits track which kfunc sets are currently
 * allowed for %current. This simple per-task tracking works because SCX ops
 * nest in a limited way. BPF will likely implement a way to allow and
 * disallow kfuncs depending on the calling context, which will replace this
 * manual mechanism. See scx_kf_allow().
 */
enum scx_kf_mask {
	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
	/*
	 * ops.dispatch() may release the rq lock temporarily and thus ENQUEUE
	 * and SELECT_CPU may be nested inside it. ops.dequeue (in REST) may
	 * also be nested inside DISPATCH.
	 */
	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */

	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};
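
/*
 * A sketch of how the mask is consulted, derived from the definitions above
 * (the helper name is illustrative, not the kernel's):
 *
 *	static bool scx_kf_rq_locked(void)
 *	{
 *		return current->scx.kf_mask & __SCX_KF_RQ_LOCKED;
 *	}
 *
 * Kfuncs that require the rq lock can reject callers whose kf_mask is
 * SCX_KF_UNLOCKED.
 */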

enum scx_dsq_lnode_flags {
	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,

	/* the high 16 bits can be used for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
};

struct scx_dsq_list_node {
	struct list_head	node;
	u32			flags;
	u32			priv;	/* can be used by iter cursor */
};

#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv)			\
	(struct scx_dsq_list_node) {					\
		.node = LIST_HEAD_INIT((__node).node),			\
		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags),		\
		.priv = (__priv),					\
	}
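
/*
 * Illustrative use, assuming a cursor embedded in a DSQ iterator state
 * (the struct name is hypothetical):
 *
 *	struct dsq_iter_state {
 *		struct scx_dsq_list_node cursor;
 *	} *kit;
 *
 *	kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, 0, 0);
 *
 * The cursor is told apart from real tasks on the DSQ list by
 * SCX_DSQ_LNODE_ITER_CURSOR in ->flags.
 */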

/*
 * The following is embedded in task_struct and contains all fields necessary
 * for a task to be scheduled by SCX.
 */
struct sched_ext_entity {
	struct scx_dispatch_q	*dsq;
	struct scx_dsq_list_node dsq_list;	/* dispatch order */
	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
	u32			dsq_seq;
	u32			dsq_flags;	/* protected by DSQ lock */
	u32			flags;		/* protected by rq lock */
	u32			weight;
	s32			sticky_cpu;
	s32			holding_cpu;
	s32			selected_cpu;
	u32			kf_mask;	/* see scx_kf_mask above */
	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
	atomic_long_t		ops_state;

	struct list_head	runnable_node;	/* rq->scx.runnable_list */
	unsigned long		runnable_at;

#ifdef CONFIG_SCHED_CORE
	u64			core_sched_at;	/* see scx_prio_less() */
#endif
	u64			ddsp_dsq_id;
	u64			ddsp_enq_flags;

	/* BPF scheduler modifiable fields */

	/*
	 * Runtime budget in nsecs. This is usually set through
	 * scx_bpf_dsq_insert() but can also be modified directly by the BPF
	 * scheduler. Automatically decreased by SCX as the task executes. On
	 * depletion, a scheduling event is triggered.
	 *
	 * This value is cleared to zero if the task is preempted by
	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
	 * task ran. Use p->se.sum_exec_runtime instead.
	 */
	u64			slice;
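
	/*
	 * A sketch of the two usual ways the slice is set, assuming
	 * scx_bpf_dsq_insert()'s (task, dsq_id, slice, enq_flags) argument
	 * order:
	 *
	 *	// from ops.enqueue(): insert with a custom 5ms slice
	 *	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, 5 * 1000000, enq_flags);
	 *
	 *	// or refill the budget directly
	 *	p->scx.slice = SCX_SLICE_DFL;
	 */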

	/*
	 * Used to order tasks when dispatching to the vtime-ordered priority
	 * queue of a DSQ. This is usually set through
	 * scx_bpf_dsq_insert_vtime() but can also be modified directly by the
	 * BPF scheduler. Modifying it while a task is queued on a DSQ may
	 * mangle the ordering and is not recommended.
	 */
	u64			dsq_vtime;
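
	/*
	 * A sketch of weighted vtime scheduling from a BPF scheduler,
	 * assuming scx_bpf_dsq_insert_vtime()'s (task, dsq_id, slice, vtime,
	 * enq_flags) argument order. SHARED_DSQ is a hypothetical ops-created
	 * DSQ and @used the runtime consumed since the task last started
	 * running:
	 *
	 *	// charge runtime inversely proportional to weight, then queue
	 *	p->scx.dsq_vtime += used * 100 / p->scx.weight;
	 *	scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL,
	 *				 p->scx.dsq_vtime, enq_flags);
	 */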

	/*
	 * If set, reject future sched_setscheduler(2) calls updating the
	 * policy to %SCHED_EXT with -%EACCES.
	 *
	 * Can be set from ops.init_task() while the BPF scheduler is being
	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
	 * already %SCHED_EXT, the task is rejected and its policy is
	 * forcefully reverted to %SCHED_NORMAL. The number of such events is
	 * reported through /sys/kernel/debug/sched_ext::nr_rejected. Setting
	 * this flag during fork is not allowed.
	 */
	bool			disallow;	/* reject switching into SCX */
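
	/*
	 * A sketch of opting a task out while the scheduler is being loaded,
	 * from ops.init_task() (is_excluded() is a hypothetical policy
	 * check):
	 *
	 *	s32 BPF_STRUCT_OPS(myops_init_task, struct task_struct *p,
	 *			   struct scx_init_task_args *args)
	 *	{
	 *		if (!args->fork && is_excluded(p))
	 *			p->scx.disallow = true;
	 *		return 0;
	 *	}
	 */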

	/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
	struct cgroup		*cgrp_moving_from;
#endif
	struct list_head	tasks_node;
};

void sched_ext_dead(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s);
bool scx_hardlockup(int cpu);
bool scx_rcu_cpu_stall(void);

#else	/* !CONFIG_SCHED_CLASS_EXT */

static inline void sched_ext_dead(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}
static inline bool scx_hardlockup(int cpu) { return false; }
static inline bool scx_rcu_cpu_stall(void) { return false; }

#endif	/* CONFIG_SCHED_CLASS_EXT */

struct scx_task_group {
#ifdef CONFIG_EXT_GROUP_SCHED
	u32			flags;		/* SCX_TG_* */
	u32			weight;
	u64			bw_period_us;
	u64			bw_quota_us;
	u64			bw_burst_us;
	bool			idle;
#endif
};

#endif	/* _LINUX_SCHED_EXT_H */