// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Queued spinlock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 */

#ifndef _GEN_PV_LOCK_SLOWPATH

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
#include <trace/events/lock.h>

/*
 * Include queued spinlock statistics code
 */
#include "qspinlock_stat.h"

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock, however to
 * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
 * existing API, we must modify it somehow.
 *
 * In particular; where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending (next->locked), we compress both these: {tail,
 * next->locked} into a single u32 value.
 *
 * A spinlock disables recursion of its own context, and there is a limit
 * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
 * are at most 4 nesting levels, the nesting level can be encoded by a 2-bit
 * number. Now we can encode the tail by combining the 2-bit nesting level with
 * the cpu number. With one byte for the lock value and 3 bytes for the tail,
 * only a 32-bit word is now needed. Even though we only need 1 bit for the
 * lock, we extend it to a full byte to achieve better performance for
 * architectures that support atomic byte write.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node; whereby avoiding the need to carry a node from lock to unlock, and
 * preserving existing lock API. This also makes the unlock code simpler and
 * faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on smaller 8-bit and 16-bit data types.
 */
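
/*
 * Editor's note (not part of the kernel source): a minimal, standalone
 * userspace sketch of the classic MCS lock referred to above, using C11
 * atomics. The names (mcs_node, mcs_lock, mcs_unlock) are illustrative only;
 * the kernel's actual per-CPU node handling and tail encoding follow below.
 */
#if 0	/* illustrative only; never compiled as part of this file */
#include <stdatomic.h>
#include <stdbool.h>

struct mcs_node {
	struct mcs_node *_Atomic next;
	atomic_bool locked;			/* true while this waiter must spin */
};

static struct mcs_node *_Atomic mcs_tail;	/* NULL when the lock is free */

static void mcs_lock(struct mcs_node *self)
{
	struct mcs_node *prev;

	atomic_store_explicit(&self->next, NULL, memory_order_relaxed);
	atomic_store_explicit(&self->locked, true, memory_order_relaxed);

	/* Publish ourselves as the new tail; the old tail is our predecessor. */
	prev = atomic_exchange_explicit(&mcs_tail, self, memory_order_acq_rel);
	if (!prev)
		return;			/* lock was free; we now own it */

	/* Link behind the predecessor and spin on our own node only. */
	atomic_store_explicit(&prev->next, self, memory_order_release);
	while (atomic_load_explicit(&self->locked, memory_order_acquire))
		;
}

static void mcs_unlock(struct mcs_node *self)
{
	struct mcs_node *next = atomic_load_explicit(&self->next, memory_order_acquire);

	if (!next) {
		/* No known successor: try to reset the tail to "unlocked". */
		struct mcs_node *expected = self;

		if (atomic_compare_exchange_strong_explicit(&mcs_tail, &expected, NULL,
							    memory_order_release,
							    memory_order_relaxed))
			return;
		/* A successor is in the middle of linking in; wait for it. */
		while (!(next = atomic_load_explicit(&self->next, memory_order_acquire)))
			;
	}
	/* Hand the lock over by clearing the successor's spin flag. */
	atomic_store_explicit(&next->locked, false, memory_order_release);
}
#endif
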
#include "mcs_spinlock.h"
#define MAX_NODES	4

/*
 * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
 * size and four of them will fit nicely in one 64-byte cacheline. For
 * pvqspinlock, however, we need more space for extra data. To accommodate
 * that, we insert two more long words to pad it up to 32 bytes. IOW, only
 * two of them can fit in a cacheline in this case. That is OK as it is rare
 * to have more than 2 levels of slowpath nesting in actual use. We don't
 * want to penalize pvqspinlocks to optimize for a rare case in native
 * qspinlocks.
 */
struct qnode {
        struct mcs_spinlock mcs;
#ifdef CONFIG_PARAVIRT_SPINLOCKS
        long reserved[2];
#endif
};

/*
 * The pending bit spinning loop count.
 * This heuristic is used to limit the number of lockword accesses
 * made by atomic_cond_read_relaxed when waiting for the lock to
 * transition out of the "== _Q_PENDING_VAL" state. We don't spin
 * indefinitely because there's no guarantee that we'll make forward
 * progress.
 */
#ifndef _Q_PENDING_LOOPS
#define _Q_PENDING_LOOPS	1
#endif

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 *
 * PV doubles the storage and uses the second cacheline for PV state.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);

/*
 * We must be able to distinguish between no-tail and the tail at 0:0,
 * therefore increment the cpu number by one.
 */

static inline __pure u32 encode_tail(int cpu, int idx)
{
        u32 tail;

        tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
        tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

        return tail;
}

static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
{
        int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
        int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

        return per_cpu_ptr(&qnodes[idx].mcs, cpu);
}

static inline __pure
struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
{
        return &((struct qnode *)base + idx)->mcs;
}
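
/*
 * Editor's note (not part of the kernel source): with the common layout used
 * when NR_CPUS < 16K (_Q_PENDING_BITS == 8, defined in qspinlock_types.h),
 * the 32-bit lock word is split as:
 *
 *	bits  0- 7: locked byte
 *	bits  8-15: pending byte
 *	bits 16-17: tail index (per-CPU nesting level, 0-3)
 *	bits 18-31: tail CPU number + 1 (0 means "no tail")
 *
 * Worked example (assuming that layout): encode_tail(cpu = 5, idx = 2) is
 * (5 + 1) << 18 | 2 << 16 == 0x001a0000; decode_tail(0x001a0000) recovers
 * cpu = (0x1a0000 >> 18) - 1 = 5 and idx = (0x1a0000 >> 16) & 3 = 2, i.e.
 * the third per-CPU qnode of CPU 5.
 */
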
#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

#if _Q_PENDING_BITS == 8
/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
        WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
        WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/*
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail), which heads an address dependency
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
        /*
         * We can use relaxed semantics since the caller ensures that the
         * MCS node is properly initialized before updating the tail.
         */
        return (u32)xchg_relaxed(&lock->tail,
                                 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}

#else /* _Q_PENDING_BITS == 8 */

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
        atomic_andnot(_Q_PENDING_VAL, &lock->val);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
        atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
}

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
        u32 old, new, val = atomic_read(&lock->val);

        for (;;) {
                new = (val & _Q_LOCKED_PENDING_MASK) | tail;
                /*
                 * We can use relaxed semantics since the caller ensures that
                 * the MCS node is properly initialized before updating the
                 * tail.
                 */
                old = atomic_cmpxchg_relaxed(&lock->val, val, new);
                if (old == val)
                        break;

                val = old;
        }
        return old;
}
#endif /* _Q_PENDING_BITS == 8 */
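
/*
 * Editor's note (not part of the kernel source): the "(tail, pending, locked)"
 * triples in the comments above map onto the lock word directly. Assuming the
 * 8-bit pending layout, the hand-over performed by clear_pending_set_locked()
 * is, in concrete word values:
 *
 *	(0,1,0)  ==  0x00000100   pending set, lock free
 *	(0,0,1)  ==  0x00000001   pending cleared, lock owned
 *
 * i.e. a single 16-bit store to the locked_pending halfword performs the
 * "clear pending" and "set locked" steps with one write.
 */
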
/**
 * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
 * @lock : Pointer to queued spinlock structure
 * Return: The previous lock value
 *
 * *,*,* -> *,1,*
 */
#ifndef queued_fetch_set_pending_acquire
static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
{
        return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
}
#endif

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(struct qspinlock *lock)
{
        WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}


/*
 * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
 * all the PV callbacks.
 */

static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
                                           struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
                                           struct mcs_spinlock *node) { }
static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
                                                   struct mcs_spinlock *node)
                                                   { return 0; }

#define pv_enabled()		false

#define pv_init_node		__pv_init_node
#define pv_wait_node		__pv_wait_node
#define pv_kick_node		__pv_kick_node
#define pv_wait_head_or_lock	__pv_wait_head_or_lock

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
#endif

#endif /* _GEN_PV_LOCK_SLOWPATH */
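
/*
 * Editor's note (not part of this file): the slowpath below is only entered
 * after the uncontended fast path fails. The generic fast path in
 * include/asm-generic/qspinlock.h looks roughly like the sketch below
 * (recent kernels; shown here for illustration only).
 */
#if 0	/* illustrative only */
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
        int val = 0;

        /* 0,0,0 -> 0,0,1: uncontended acquire with a single try_cmpxchg. */
        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                return;

        /* Somebody holds the lock, is pending, or is queued: take the slowpath. */
        queued_spin_lock_slowpath(lock, val);
}
#endif
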
/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \            |  :
 *                       :       | ^--'              |            |  :
 *                       :       v                   |            |  :
 * uncontended           :    (n,x,y) +--> (n,0,0)   --'           |  :
 *   queue               :       | ^--'                            |  :
 *                       :       v                                 |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
        struct mcs_spinlock *prev, *next, *node;
        u32 old, tail;
        int idx;

        BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

        if (pv_enabled())
                goto pv_queue;

        if (virt_spin_lock(lock))
                return;

        /*
         * Wait for in-progress pending->locked hand-overs with a bounded
         * number of spins so that we guarantee forward progress.
         *
         * 0,1,0 -> 0,0,1
         */
        if (val == _Q_PENDING_VAL) {
                int cnt = _Q_PENDING_LOOPS;
                val = atomic_cond_read_relaxed(&lock->val,
                                               (VAL != _Q_PENDING_VAL) || !cnt--);
        }

        /*
         * If we observe any contention; queue.
         */
        if (val & ~_Q_LOCKED_MASK)
                goto queue;

        /*
         * trylock || pending
         *
         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
         */
        val = queued_fetch_set_pending_acquire(lock);

        /*
         * If we observe contention, there is a concurrent locker.
         *
         * Undo and queue; our setting of PENDING might have made the
         * n,0,0 -> 0,0,0 transition fail and it will now be waiting
         * on @next to become !NULL.
         */
        if (unlikely(val & ~_Q_LOCKED_MASK)) {

                /* Undo PENDING if we set it. */
                if (!(val & _Q_PENDING_MASK))
                        clear_pending(lock);

                goto queue;
        }

        /*
         * We're pending, wait for the owner to go away.
         *
         * 0,1,1 -> 0,1,0
         *
         * this wait loop must be a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because not all
         * clear_pending_set_locked() implementations imply full
         * barriers.
         */
        if (val & _Q_LOCKED_MASK)
                atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));

        /*
         * take ownership and clear the pending bit.
         *
         * 0,1,0 -> 0,0,1
         */
        clear_pending_set_locked(lock);
        lockevent_inc(lock_pending);
        return;

        /*
         * End of pending bit optimistic spinning and beginning of MCS
         * queuing.
         */
queue:
        lockevent_inc(lock_slowpath);
pv_queue:
        node = this_cpu_ptr(&qnodes[0].mcs);
        idx = node->count++;
        tail = encode_tail(smp_processor_id(), idx);

        trace_contention_begin(lock, LCB_F_SPIN);

        /*
         * 4 nodes are allocated based on the assumption that there will
         * not be nested NMIs taking spinlocks. That may not be true in
         * some architectures even though the chance of needing more than
         * 4 nodes will still be extremely unlikely. When that happens,
         * we fall back to spinning on the lock directly without using
         * any MCS node. This is not the most elegant solution, but is
         * simple enough.
         */
        if (unlikely(idx >= MAX_NODES)) {
                lockevent_inc(lock_no_node);
                while (!queued_spin_trylock(lock))
                        cpu_relax();
                goto release;
        }

        node = grab_mcs_node(node, idx);

        /*
         * Keep counts of non-zero index values:
         */
        lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

        /*
         * Ensure that we increment the head node->count before initialising
         * the actual node. If the compiler is kind enough to reorder these
         * stores, then an IRQ could overwrite our assignments.
         */
        barrier();

        node->locked = 0;
        node->next = NULL;
        pv_init_node(node);

        /*
         * We touched a (possibly) cold cacheline in the per-cpu queue node;
         * attempt the trylock once more in the hope someone let go while we
         * weren't watching.
         */
        if (queued_spin_trylock(lock))
                goto release;

        /*
         * Ensure that the initialisation of @node is complete before we
         * publish the updated tail via xchg_tail() and potentially link
         * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
         */
        smp_wmb();

        /*
         * Publish the updated tail.
         * We have already touched the queueing cacheline; don't bother with
         * pending stuff.
         *
         * p,*,* -> n,*,*
         */
        old = xchg_tail(lock, tail);
        next = NULL;

        /*
         * if there was a previous node; link it and wait until reaching the
         * head of the waitqueue.
         */
        if (old & _Q_TAIL_MASK) {
                prev = decode_tail(old);

                /* Link @node into the waitqueue. */
                WRITE_ONCE(prev->next, node);

                pv_wait_node(node, prev);
                arch_mcs_spin_lock_contended(&node->locked);

                /*
                 * While waiting for the MCS lock, the next pointer may have
                 * been set by another lock waiter. We optimistically load
                 * the next pointer & prefetch the cacheline for writing
                 * to reduce latency in the upcoming MCS unlock operation.
                 */
                next = READ_ONCE(node->next);
                if (next)
                        prefetchw(next);
        }
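
        /*
         * Editor's note (not part of the kernel source): on most architectures
         * the two arch_mcs_* hooks used above and below are the generic
         * fallbacks from kernel/locking/mcs_spinlock.h, roughly:
         *
         *	arch_mcs_spin_lock_contended(l):   smp_cond_load_acquire(l, VAL);
         *	arch_mcs_spin_unlock_contended(l): smp_store_release((l), 1);
         *
         * i.e. a queued waiter spins on its own node->locked with acquire
         * semantics, and its predecessor hands over by a single store-release
         * of 1 into that field.
         */
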
        /*
         * we're at the head of the waitqueue, wait for the owner & pending to
         * go away.
         *
         * *,x,y -> *,0,0
         *
         * this wait loop must use a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because the set_locked() function below
         * does not imply a full barrier.
         *
         * The PV pv_wait_head_or_lock function, if active, will acquire
         * the lock and return a non-zero value. So we have to skip the
         * atomic_cond_read_acquire() call. As the next PV queue head hasn't
         * been designated yet, there is no way for the locked value to become
         * _Q_SLOW_VAL. So both the set_locked() and the
         * atomic_cmpxchg_relaxed() calls will be safe.
         *
         * If PV isn't active, 0 will be returned instead.
         */
        if ((val = pv_wait_head_or_lock(lock, node)))
                goto locked;

        val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

locked:
        /*
         * claim the lock:
         *
         * n,0,0 -> 0,0,1 : lock, uncontended
         * *,*,0 -> *,*,1 : lock, contended
         *
         * If the queue head is the only one in the queue (lock value == tail)
         * and nobody is pending, clear the tail code and grab the lock.
         * Otherwise, we only need to grab the lock.
         */

        /*
         * In the PV case we might already have _Q_LOCKED_VAL set, because
         * of lock stealing; therefore we must also allow:
         *
         * n,0,1 -> 0,0,1
         *
         * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
         *       above wait condition, therefore any concurrent setting of
         *       PENDING will make the uncontended transition fail.
         */
        if ((val & _Q_TAIL_MASK) == tail) {
                if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
                        goto release; /* No contention */
        }

        /*
         * Either somebody is queued behind us or _Q_PENDING_VAL got set
         * which will then detect the remaining tail and queue behind us
         * ensuring we'll see a @next.
         */
        set_locked(lock);

        /*
         * contended path; wait for next if not observed yet, release.
         */
        if (!next)
                next = smp_cond_load_relaxed(&node->next, (VAL));

        arch_mcs_spin_unlock_contended(&next->locked);
        pv_kick_node(lock, next);

release:
        trace_contention_end(lock, 0);

        /*
         * release the node
         */
        __this_cpu_dec(qnodes[0].mcs.count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);

/*
 * Generate the paravirt code for queued_spin_unlock_slowpath().
 */
#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#define _GEN_PV_LOCK_SLOWPATH

#undef  pv_enabled
#define pv_enabled()	true

#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
#undef pv_wait_head_or_lock

#undef  queued_spin_lock_slowpath
#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath

#include "qspinlock_paravirt.h"
#include "qspinlock.c"

bool nopvspin __initdata;
static __init int parse_nopvspin(char *arg)
{
        nopvspin = true;
        return 0;
}
early_param("nopvspin", parse_nopvspin);
#endif
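
/*
 * Editor's note (not part of this file): the release side that the acquire
 * loops above pair with is the generic queued_spin_unlock() from
 * include/asm-generic/qspinlock.h, roughly the sketch below: a plain
 * store-release of 0 into the locked byte, which is why the lock byte was
 * widened from 1 bit to 8 bits in the first place.
 */
#if 0	/* illustrative only */
static __always_inline void queued_spin_unlock(struct qspinlock *lock)
{
        /*
         * unlock() needs release semantics:
         */
        smp_store_release(&lock->locked, 0);
}
#endif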