// SPDX-License-Identifier: GPL-2.0
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>

/*
 * An MCS-like lock especially tailored for optimistic spinning in sleeping
 * lock implementations (mutex, rwsem, etc).
 *
 * Using a single mcs node per CPU is safe because sleeping locks should not be
 * called from interrupt context and we have preemption disabled while
 * spinning.
 */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
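
/*
 * Each CPU owns exactly one node. A node carries the ->next and ->prev
 * queue links, a ->locked flag that its predecessor sets to hand over the
 * lock, and ->cpu, the encoded CPU number (see encode_cpu() below) that
 * lets a tail value be mapped back to its node.
 */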

/*
 * We use the value 0 to represent "no CPU", thus the encoded value
 * will be the CPU number incremented by 1.
 */
static inline int encode_cpu(int cpu_nr)
{
    return cpu_nr + 1;
}

static inline int node_cpu(struct optimistic_spin_node *node)
{
    return node->cpu - 1;
}

static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
{
    int cpu_nr = encoded_cpu_val - 1;

    return per_cpu_ptr(&osq_node, cpu_nr);
}
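
/*
 * For example, CPU 0 is encoded as 1 in the lock's tail, and decode_cpu(1)
 * returns CPU 0's per-CPU node; a tail of OSQ_UNLOCKED_VAL (0) therefore
 * means "no CPU", i.e. the queue is empty.
 */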

/*
 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
 * Can return NULL in case we were the last queued and we updated @lock instead.
 */
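/*
 * Called from osq_unlock() with @prev == NULL, and from osq_lock()'s
 * unqueue path (Step-B below) with @prev set to our predecessor.
 */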
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
          struct optimistic_spin_node *node,
          struct optimistic_spin_node *prev)
{
    struct optimistic_spin_node *next = NULL;
    int curr = encode_cpu(smp_processor_id());
    int old;

    /*
     * If there is a prev node in queue, then the 'old' value will be
     * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since,
     * if we're currently last in queue, the queue will become empty.
     */
    old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;

    for (;;) {
        if (atomic_read(&lock->tail) == curr &&
            atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
            /*
             * We were the last queued, we moved @lock back. @prev
             * will now observe @lock and will complete its
             * unlock()/unqueue().
             */
            break;
        }

        /*
         * We must xchg() the @node->next value, because if we were to
         * leave it in, a concurrent unlock()/unqueue() from
         * @node->next might complete Step-A and think its @prev is
         * still valid.
         *
         * If the concurrent unlock()/unqueue() wins the race, we'll
         * wait for either @lock to point to us, through its Step-B, or
         * wait for a new @node->next from its Step-C.
         */
        if (node->next) {
            next = xchg(&node->next, NULL);
            if (next)
                break;
        }

        cpu_relax();
    }

    return next;
}
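
/*
 * Try to acquire the OSQ, spinning on our per-CPU node until our
 * predecessor hands the lock over. Returns true if we now own the OSQ;
 * returns false if we gave up (need_resched() or a preempted vCPU) and
 * have already unqueued ourselves. Callers must have preemption disabled.
 */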
bool osq_lock(struct optimistic_spin_queue *lock)
{
    struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
    struct optimistic_spin_node *prev, *next;
    int curr = encode_cpu(smp_processor_id());
    int old;

    node->locked = 0;
    node->next = NULL;
    node->cpu = curr;

    /*
     * We need both ACQUIRE (pairs with corresponding RELEASE in
     * unlock() uncontended, or fastpath) and RELEASE (to publish
     * the node fields we just initialised) semantics when updating
     * the lock tail.
     */
    old = atomic_xchg(&lock->tail, curr);
    if (old == OSQ_UNLOCKED_VAL)
        return true;
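
    /*
     * The queue was not empty: @old encodes the CPU whose node was at
     * the tail, so link in behind it and wait for the lock to be passed
     * to us via our ->locked field.
     */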
    prev = decode_cpu(old);
    node->prev = prev;

    /*
     * osq_lock()               unqueue
     *
     * node->prev = prev        osq_wait_next()
     * WMB                      MB
     * prev->next = node        next->prev = prev // unqueue-C
     *
     * Here 'node->prev' and 'next->prev' are the same variable and we need
     * to ensure these stores happen in-order to avoid corrupting the list.
     */
    smp_wmb();

    WRITE_ONCE(prev->next, node);

    /*
     * Normally @prev is untouchable after the above store, because at that
     * moment unlock can proceed and wipe the node element from stack.
     *
     * However, since our nodes are static per-cpu storage, we're
     * guaranteed their existence -- this allows us to apply
     * cmpxchg in an attempt to undo our queueing.
     */

    /*
     * Wait to acquire the lock or for cancellation. Note that need_resched()
     * will come with an IPI, which will wake smp_cond_load_relaxed() if it
     * is implemented with a monitor-wait. vcpu_is_preempted() relies on
     * polling, be careful.
     */
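    /*
     * If the load below returns non-zero, our predecessor set ->locked
     * and handed us the lock. A zero return means we stopped spinning
     * because of need_resched() or a preempted vCPU, and must unqueue
     * ourselves below.
     */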
    if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
                  vcpu_is_preempted(node_cpu(node->prev))))
        return true;

    /* unqueue */
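    /*
     * We are queued behind @prev and there may be nodes queued behind
     * us up to the tail. Step-A detaches us from @prev, Step-B finds a
     * stable successor (or, if we are last, moves the tail back to
     * @prev), and Step-C links @prev and that successor together.
     */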
    /*
     * Step - A  -- stabilize @prev
     *
     * Undo our @prev->next assignment; this will make @prev's
     * unlock()/unqueue() wait for a next pointer since @lock points to us
     * (or later).
     */

    for (;;) {
        /*
         * cpu_relax() below implies a compiler barrier which would
         * prevent this comparison being optimized away.
         */
        if (data_race(prev->next) == node &&
            cmpxchg(&prev->next, node, NULL) == node)
            break;

        /*
         * We can only fail the cmpxchg() racing against an unlock(),
         * in which case we should observe @node->locked becoming
         * true.
         */
        if (smp_load_acquire(&node->locked))
            return true;

        cpu_relax();

        /*
         * Or we race against a concurrent unqueue()'s step-B, in which
         * case its step-C will write us a new @node->prev pointer.
         */
        prev = READ_ONCE(node->prev);
    }

    /*
     * Step - B -- stabilize @next
     *
     * Similar to unlock(), wait for @node->next or move @lock from @node
     * back to @prev.
     */

    next = osq_wait_next(lock, node, prev);
    if (!next)
        return false;

    /*
     * Step - C -- unlink
     *
     * @prev is stable because it's still waiting for a new @prev->next
     * pointer, @next is stable because our @node->next pointer is NULL and
     * it will wait in Step-A.
     */

    WRITE_ONCE(next->prev, prev);
    WRITE_ONCE(prev->next, next);

    return false;
}
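
/*
 * Release the OSQ: either reset the tail if nobody queued behind us, or
 * hand the lock to our successor by setting its ->locked flag.
 */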
void osq_unlock(struct optimistic_spin_queue *lock)
{
    struct optimistic_spin_node *node, *next;
    int curr = encode_cpu(smp_processor_id());

    /*
     * Fast path for the uncontended case.
     */
    if (likely(atomic_cmpxchg_release(&lock->tail, curr,
                      OSQ_UNLOCKED_VAL) == curr))
        return;

    /*
     * Second most likely case: hand the lock to our queued successor,
     * if it has already linked itself in.
     */
    node = this_cpu_ptr(&osq_node);
    next = xchg(&node->next, NULL);
    if (next) {
        WRITE_ONCE(next->locked, 1);
        return;
    }
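
    /*
     * Our ->next was NULL even though the tail says we are not last:
     * the successor either has not linked itself in yet or is busy
     * unqueueing. Wait until we get a stable successor to hand the
     * lock to, or until the tail returns to us and the queue can be
     * marked unlocked.
     */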
    next = osq_wait_next(lock, node, NULL);
    if (next)
        WRITE_ONCE(next->locked, 1);
}
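
/*
 * Example (illustration only, not part of this file): a minimal sketch of
 * how a sleeping-lock slowpath might drive the OSQ, loosely modelled on
 * the mutex/rwsem optimistic-spin paths. "struct demo_lock",
 * demo_trylock() and demo_optimistic_spin() are hypothetical names, not
 * kernel APIs.
 */
struct demo_lock {
    atomic_t owner;                     /* 0 == unlocked */
    struct optimistic_spin_queue osq;   /* initialise with osq_lock_init() */
};

static inline bool demo_trylock(struct demo_lock *lock)
{
    return atomic_cmpxchg_acquire(&lock->owner, 0, 1) == 0;
}

static bool demo_optimistic_spin(struct demo_lock *lock)
{
    bool acquired = false;

    preempt_disable();

    /*
     * osq_lock() serialises the spinners: only the OSQ owner spins on
     * the lock word itself, everyone else spins locally on its own
     * per-CPU node. A false return means we were told to stop (e.g.
     * need_resched()) and have already been removed from the queue.
     */
    if (!osq_lock(&lock->osq))
        goto out;

    while (!(acquired = demo_trylock(lock))) {
        if (need_resched())
            break;
        cpu_relax();
    }

    osq_unlock(&lock->osq);
out:
    preempt_enable();
    return acquired;    /* on false, the caller falls back to sleeping */
}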