/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PREEMPT_H
#define __LINUX_PREEMPT_H

/*
 * include/linux/preempt.h - macros for accessing and manipulating
 * preempt_count (used for kernel preemption, interrupt count, etc.)
 */

#include <linux/linkage.h>
#include <linux/list.h>

/*
 * We put the hardirq and softirq counter into the preemption
 * counter. The bitmask has the following meaning:
 *
 * - bits 0-7 are the preemption count (max preemption depth: 256)
 * - bits 8-15 are the softirq count (max # of softirqs: 256)
 *
 * The hardirq count could in theory be the same as the number of
 * interrupts in the system, but we run all interrupt handlers with
 * interrupts disabled, so we cannot have nesting interrupts. Though
 * there are a few palaeontologic drivers which reenable interrupts in
 * the handler, so we need more than one bit here.
 *
 *         PREEMPT_MASK:    0x000000ff
 *         SOFTIRQ_MASK:    0x0000ff00
 *         HARDIRQ_MASK:    0x000f0000
 *             NMI_MASK:    0x00f00000
 * PREEMPT_NEED_RESCHED:    0x80000000
 */
#define PREEMPT_BITS    8
#define SOFTIRQ_BITS    8
#define HARDIRQ_BITS    4
#define NMI_BITS    4

#define PREEMPT_SHIFT   0
#define SOFTIRQ_SHIFT   (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT   (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT   (HARDIRQ_SHIFT + HARDIRQ_BITS)

#define __IRQ_MASK(x)   ((1UL << (x))-1)

#define PREEMPT_MASK    (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK    (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK    (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK    (__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)

#define PREEMPT_OFFSET  (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET  (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET  (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET  (1UL << NMI_SHIFT)

#define SOFTIRQ_DISABLE_OFFSET  (2 * SOFTIRQ_OFFSET)

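/*
 * Example (editor's illustration, not part of the original header): how the
 * masks and shifts above carve preempt_count() into independent counters.
 *
 *	unsigned long pc = preempt_count();
 *	unsigned int preempt_depth = (pc & PREEMPT_MASK) >> PREEMPT_SHIFT;
 *	unsigned int softirq_cnt   = (pc & SOFTIRQ_MASK) >> SOFTIRQ_SHIFT;
 *	unsigned int hardirq_depth = (pc & HARDIRQ_MASK) >> HARDIRQ_SHIFT;
 *	unsigned int nmi_depth     = (pc & NMI_MASK)     >> NMI_SHIFT;
 *
 * SOFTIRQ_DISABLE_OFFSET is 2 * SOFTIRQ_OFFSET so that the lowest bit of the
 * softirq field can flag "currently serving a softirq" (see
 * in_serving_softirq() below) while the higher bits count BH-disable nesting.
 */
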
#define PREEMPT_DISABLED    (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

/*
 * Disable preemption until the scheduler is running -- use an unconditional
 * value so that it also works on !PREEMPT_COUNT kernels.
 *
 * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 */
#define INIT_PREEMPT_COUNT  PREEMPT_OFFSET

/*
 * Initial preempt_count value; reflects the preempt_count schedule invariant
 * which states that during context switches:
 *
 *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 *
 * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 * Note: See finish_task_switch().
 */
#define FORK_PREEMPT_COUNT  (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

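/*
 * Worked example (editor's note, assuming CONFIG_PREEMPT_COUNT=y and the
 * asm-generic PREEMPT_ENABLED value of 0): PREEMPT_DISABLE_OFFSET is then
 * PREEMPT_OFFSET (1), so INIT_PREEMPT_COUNT == 1 and FORK_PREEMPT_COUNT == 2,
 * matching the schedule invariant above. Architectures that fold
 * PREEMPT_NEED_RESCHED into the count (e.g. x86) OR that bit into
 * PREEMPT_ENABLED, so the raw values differ there.
 */
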
/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
#include <asm/preempt.h>

/**
 * interrupt_context_level - return interrupt context level
 *
 * Returns the current interrupt context level.
 *  0 - normal context
 *  1 - softirq context
 *  2 - hardirq context
 *  3 - NMI context
 */
static __always_inline unsigned char interrupt_context_level(void)
{
    unsigned long pc = preempt_count();
    unsigned char level = 0;

    level += !!(pc & (NMI_MASK));
    level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
    level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));

    return level;
}

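/*
 * Example (editor's illustration): how the three additions above produce the
 * documented levels. In plain task context none of the bits are set, so all
 * three terms are 0. While serving a softirq only SOFTIRQ_OFFSET is set, so
 * only the last term contributes: level == 1. In a hardirq the HARDIRQ bits
 * are set, so the last two terms contribute: level == 2. In an NMI all three
 * terms fire: level == 3. Note that the final test uses SOFTIRQ_OFFSET (a
 * softirq is actually being served), not SOFTIRQ_MASK, so merely having BH
 * disabled does not raise the level.
 */
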
#define nmi_count() (preempt_count() & NMI_MASK)
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#ifdef CONFIG_PREEMPT_RT
# define softirq_count()    (current->softirq_disable_cnt & SOFTIRQ_MASK)
#else
# define softirq_count()    (preempt_count() & SOFTIRQ_MASK)
#endif
#define irq_count() (nmi_count() | hardirq_count() | softirq_count())

/*
 * Macros to retrieve the current execution context:
 *
 * in_nmi()     - We're in NMI context
 * in_hardirq()     - We're in hard IRQ context
 * in_serving_softirq() - We're in softirq context
 * in_task()        - We're in task context
 */
#define in_nmi()        (nmi_count())
#define in_hardirq()        (hardirq_count())
#define in_serving_softirq()    (softirq_count() & SOFTIRQ_OFFSET)
#define in_task()       (!(in_nmi() | in_hardirq() | in_serving_softirq()))

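/*
 * Usage sketch (editor's illustration; struct frob_dev, do_slow_path() and
 * the work item are hypothetical): picking a code path based on the current
 * context. in_task() means it is safe to sleep (mutexes, GFP_KERNEL
 * allocations); otherwise the heavy work must be deferred, since NMI,
 * hardirq and softirq context cannot sleep.
 *
 *	static void frob_hw(struct frob_dev *dev)
 *	{
 *		if (in_task()) {
 *			mutex_lock(&dev->lock);
 *			do_slow_path(dev);
 *			mutex_unlock(&dev->lock);
 *		} else {
 *			schedule_work(&dev->work);
 *		}
 *	}
 */
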
/*
 * The following macros are deprecated and should not be used in new code:
 * in_irq()       - Obsolete version of in_hardirq()
 * in_softirq()   - We have BH disabled, or are processing softirqs
 * in_interrupt() - We're in NMI, IRQ or SoftIRQ context, or have BH disabled
 */
#define in_irq()        (hardirq_count())
#define in_softirq()        (softirq_count())
#define in_interrupt()      (irq_count())

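/*
 * Editor's note, conversion sketch for code still using the obsolete helpers:
 *
 *	if (in_irq())		becomes		if (in_hardirq())
 *
 * in_softirq() and in_interrupt() have no one-to-one replacement; callers
 * need to decide whether they mean "serving a softirq" (in_serving_softirq())
 * or merely "BH disabled" and use the precise macros above.
 */
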
/*
 * The preempt_count offset after preempt_disable()
 */
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET
#else
# define PREEMPT_DISABLE_OFFSET 0
#endif

/*
 * The preempt_count offset after spin_lock()
 */
#if !defined(CONFIG_PREEMPT_RT)
#define PREEMPT_LOCK_OFFSET     PREEMPT_DISABLE_OFFSET
#else
/* Locks on RT do not disable preemption */
#define PREEMPT_LOCK_OFFSET     0
#endif

/*
 * The preempt_count offset needed for things like:
 *
 *  spin_lock_bh()
 *
 * which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
 * softirqs, such that unlock sequences of:
 *
 *  spin_unlock();
 *  local_bh_enable();
 *
 * work as expected.
 */
#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)

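/*
 * Example (editor's illustration) of the bookkeeping described above on a
 * !PREEMPT_RT kernel with CONFIG_PREEMPT_COUNT=y, starting from
 * preempt_count() == 0:
 *
 *	spin_lock_bh(&lock);	count += SOFTIRQ_LOCK_OFFSET    (0x201)
 *	spin_unlock(&lock);	count -= PREEMPT_LOCK_OFFSET    (0x001)
 *	local_bh_enable();	count -= SOFTIRQ_DISABLE_OFFSET (0x200)
 *
 * leaving the count at 0 again, which is exactly why SOFTIRQ_LOCK_OFFSET is
 * defined as SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET.
 */
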
/*
 * Are we running in atomic context?  WARNING: this macro cannot
 * always detect atomic context; in particular, it cannot know about
 * held spinlocks in non-preemptible kernels.  Thus it should not be
 * used in the general case to determine whether sleeping is possible.
 * Do not use in_atomic() in driver code.
 */
#define in_atomic() (preempt_count() != 0)

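/*
 * Editor's illustration of the warning above: on a kernel without
 * CONFIG_PREEMPT_COUNT, spin_lock() does not touch preempt_count, so a
 * (hypothetical, buggy) sequence such as
 *
 *	spin_lock(&lock);
 *	if (!in_atomic())
 *		do_something_that_sleeps();
 *	spin_unlock(&lock);
 *
 * happily takes the "not atomic" branch even though sleeping here would be
 * wrong. That is why in_atomic() must not be used in driver code to decide
 * whether sleeping is allowed.
 */
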
/*
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)

#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
extern void preempt_count_add(int val);
extern void preempt_count_sub(int val);
#define preempt_count_dec_and_test() \
    ({ preempt_count_sub(1); should_resched(0); })
#else
#define preempt_count_add(val)  __preempt_count_add(val)
#define preempt_count_sub(val)  __preempt_count_sub(val)
#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
#endif

#define __preempt_count_inc() __preempt_count_add(1)
#define __preempt_count_dec() __preempt_count_sub(1)

#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)

#ifdef CONFIG_PREEMPT_COUNT

#define preempt_disable() \
do { \
    preempt_count_inc(); \
    barrier(); \
} while (0)

#define sched_preempt_enable_no_resched() \
do { \
    barrier(); \
    preempt_count_dec(); \
} while (0)

#define preempt_enable_no_resched() sched_preempt_enable_no_resched()

#define preemptible()   (preempt_count() == 0 && !irqs_disabled())

#ifdef CONFIG_PREEMPTION
#define preempt_enable() \
do { \
    barrier(); \
    if (unlikely(preempt_count_dec_and_test())) \
        __preempt_schedule(); \
} while (0)

#define preempt_enable_notrace() \
do { \
    barrier(); \
    if (unlikely(__preempt_count_dec_and_test())) \
        __preempt_schedule_notrace(); \
} while (0)

#define preempt_check_resched() \
do { \
    if (should_resched(0)) \
        __preempt_schedule(); \
} while (0)

#else /* !CONFIG_PREEMPTION */
#define preempt_enable() \
do { \
    barrier(); \
    preempt_count_dec(); \
} while (0)

#define preempt_enable_notrace() \
do { \
    barrier(); \
    __preempt_count_dec(); \
} while (0)

#define preempt_check_resched() do { } while (0)
#endif /* CONFIG_PREEMPTION */

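/*
 * Usage sketch (editor's illustration; the per-CPU variable is hypothetical):
 * the classic pattern the two flavours of preempt_enable() above implement.
 *
 *	static DEFINE_PER_CPU(int, frob_count);
 *
 *	static void bump_frob_count(void)
 *	{
 *		preempt_disable();
 *		__this_cpu_inc(frob_count);
 *		preempt_enable();
 *	}
 *
 * With CONFIG_PREEMPTION, preempt_enable() reschedules immediately if the
 * count drops to zero and a reschedule was requested while preemption was
 * off; without it, the count is simply decremented and any pending reschedule
 * waits for the next explicit scheduling point.
 */
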
#define preempt_disable_notrace() \
do { \
    __preempt_count_inc(); \
    barrier(); \
} while (0)

#define preempt_enable_no_resched_notrace() \
do { \
    barrier(); \
    __preempt_count_dec(); \
} while (0)

#else /* !CONFIG_PREEMPT_COUNT */

/*
 * Even if we don't have any preemption, we need preempt disable/enable
 * to be barriers, so that things like get_user()/put_user(), which can
 * cause faults and scheduling, do not migrate into our preempt-protected
 * region.
 */
#define preempt_disable()           barrier()
#define sched_preempt_enable_no_resched()   barrier()
#define preempt_enable_no_resched()     barrier()
#define preempt_enable()            barrier()
#define preempt_check_resched()         do { } while (0)

#define preempt_disable_notrace()       barrier()
#define preempt_enable_no_resched_notrace() barrier()
#define preempt_enable_notrace()        barrier()
#define preemptible()               0

#endif /* CONFIG_PREEMPT_COUNT */

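/*
 * Editor's note on the comment above: even as plain barrier()s these macros
 * still constrain the compiler. In a sketch like the following (state being
 * a hypothetical per-cpu variable),
 *
 *	preempt_disable();
 *	this_cpu_write(state, 1);
 *	preempt_enable();
 *
 * the two barriers keep the compiler from moving memory accesses (including
 * ones that may fault and schedule, such as get_user()/put_user()) into or
 * out of the notional critical section, even though no preempt count is
 * maintained.
 */
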
#ifdef MODULE
/*
 * Modules have no business playing preemption tricks.
 */
#undef sched_preempt_enable_no_resched
#undef preempt_enable_no_resched
#undef preempt_enable_no_resched_notrace
#undef preempt_check_resched
#endif

#define preempt_set_need_resched() \
do { \
    set_preempt_need_resched(); \
} while (0)
#define preempt_fold_need_resched() \
do { \
    if (tif_need_resched()) \
        set_preempt_need_resched(); \
} while (0)

#ifdef CONFIG_PREEMPT_NOTIFIERS

struct preempt_notifier;

/**
 * preempt_ops - notifiers called when a task is preempted and rescheduled
 * @sched_in: we're about to be rescheduled:
 *    notifier: struct preempt_notifier for the task being scheduled
 *    cpu:  cpu we're scheduled on
 * @sched_out: we've just been preempted
 *    notifier: struct preempt_notifier for the task being preempted
 *    next: the task that's kicking us out
 *
 * Please note that sched_in and sched_out are called in different contexts:
 * sched_out is called with the rq lock held and irqs disabled, while
 * sched_in is called without the rq lock and with irqs enabled.  This
 * difference is intentional and depended upon by its users.
 */
struct preempt_ops {
    void (*sched_in)(struct preempt_notifier *notifier, int cpu);
    void (*sched_out)(struct preempt_notifier *notifier,
              struct task_struct *next);
};

/**
 * preempt_notifier - key for installing preemption notifiers
 * @link: internal use
 * @ops: defines the notifier functions to be called
 *
 * Usually used in conjunction with container_of().
 */
struct preempt_notifier {
    struct hlist_node link;
    struct preempt_ops *ops;
};

void preempt_notifier_inc(void);
void preempt_notifier_dec(void);
void preempt_notifier_register(struct preempt_notifier *notifier);
void preempt_notifier_unregister(struct preempt_notifier *notifier);

static inline void preempt_notifier_init(struct preempt_notifier *notifier,
                     struct preempt_ops *ops)
{
    INIT_HLIST_NODE(&notifier->link);
    notifier->ops = ops;
}

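/*
 * Usage sketch (editor's illustration; the vcpu structure and callbacks are
 * hypothetical, showing how a notifier is typically embedded and recovered
 * with container_of()):
 *
 *	struct my_vcpu {
 *		struct preempt_notifier pn;
 *		...
 *	};
 *
 *	static void my_sched_in(struct preempt_notifier *n, int cpu)
 *	{
 *		struct my_vcpu *v = container_of(n, struct my_vcpu, pn);
 *
 *		load_vcpu_state(v, cpu);
 *	}
 *
 *	static void my_sched_out(struct preempt_notifier *n, struct task_struct *next)
 *	{
 *		struct my_vcpu *v = container_of(n, struct my_vcpu, pn);
 *
 *		save_vcpu_state(v);
 *	}
 *
 *	static struct preempt_ops my_preempt_ops = {
 *		.sched_in  = my_sched_in,
 *		.sched_out = my_sched_out,
 *	};
 *
 *	preempt_notifier_init(&v->pn, &my_preempt_ops);
 *	preempt_notifier_register(&v->pn);
 */
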
#endif

#ifdef CONFIG_SMP

/*
 * Migrate-Disable and why it is undesired.
 *
 * When a preempted task becomes eligible to run under the ideal model (IOW it
 * becomes one of the M highest priority tasks), it might still have to wait
 * for the preemptee's migrate_disable() section to complete, thereby suffering
 * a reduction in bandwidth for the exact duration of the migrate_disable()
 * section.
 *
 * Per this argument, the change from preempt_disable() to migrate_disable()
 * gets us:
 *
 * - a higher priority task gains reduced wake-up latency; with preempt_disable()
 *   it would have had to wait for the lower priority task.
 *
 * - a lower priority task, which under preempt_disable() could have instantly
 *   migrated away when another CPU became available, is now constrained
 *   by the ability to push the higher priority task away, which might itself be
 *   in a migrate_disable() section, reducing its available bandwidth.
 *
 * IOW it trades latency / moves the interference term around, but the
 * interference stays in the system, and as long as it remains unbounded, the
 * system is not fully deterministic.
 *
 *
 * The reason we have it anyway.
 *
 * PREEMPT_RT breaks a number of assumptions traditionally held. Forcing a
 * number of primitives to become preemptible would also allow migration.
 * This turns out to break a bunch of per-cpu usage. To this end,
 * all these primitives employ migrate_disable() to restore this implicit
 * assumption.
 *
 * This is a 'temporary' work-around at best. The correct solution is getting
 * rid of the above assumptions and reworking the code to employ explicit
 * per-cpu locking or short preempt-disable regions.
 *
 * The end goal must be to get rid of migrate_disable(); alternatively, we need
 * a schedulability theory that does not depend on arbitrary migration.
 *
 *
 * Notes on the implementation.
 *
 * The implementation is particularly tricky since existing code patterns
 * dictate that neither migrate_disable() nor migrate_enable() is allowed to
 * block. This means that it cannot use cpus_read_lock() to serialize against
 * hotplug, nor can it easily migrate itself into a pending affinity mask
 * change on migrate_enable().
 *
 *
 * Note: even non-work-conserving schedulers, such as semi-partitioned ones,
 *       depend on migration, so migrate_disable() is not only a problem for
 *       work-conserving schedulers.
 *
 */
extern void migrate_disable(void);
extern void migrate_enable(void);

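/*
 * Usage sketch (editor's illustration; frob_stats and its fields are
 * hypothetical): the PREEMPT_RT-friendly pattern described above, where
 * staying on this CPU matters but being preempted does not. On PREEMPT_RT
 * the spin_lock() below may sleep, which is fine inside migrate_disable():
 * the task can be preempted, it just cannot be migrated.
 *
 *	static DEFINE_PER_CPU(struct frob_stats, frob_stats);
 *
 *	migrate_disable();
 *	struct frob_stats *stats = this_cpu_ptr(&frob_stats);
 *
 *	spin_lock(&stats->lock);
 *	stats->events++;
 *	spin_unlock(&stats->lock);
 *	migrate_enable();
 */
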
#else

static inline void migrate_disable(void) { }
static inline void migrate_enable(void) { }

#endif /* CONFIG_SMP */

#endif /* __LINUX_PREEMPT_H */