Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 
0003 #include <linux/sched/signal.h>
0004 
0005 #include "futex.h"
0006 #include "../locking/rtmutex_common.h"
0007 
0008 /*
0009  * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
0010  * underlying rtmutex. The task which is about to be requeued could have
0011  * just woken up (timeout, signal). After the wake up the task has to
0012  * acquire hash bucket lock, which is held by the requeue code.  As a task
0013  * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
0014  * and the hash bucket lock blocking would collide and corrupt state.
0015  *
0016  * On !PREEMPT_RT this is not a problem and everything could be serialized
0017  * on the hash bucket lock, but aside from the benefit of common code,
0018  * this avoids doing the requeue when the task is already on the way out,
0019  * and avoids taking the hash bucket lock of the original uaddr1 when the
0020  * requeue has been completed.
0021  *
0022  * The following state transitions are valid:
0023  *
0024  * On the waiter side:
0025  *   Q_REQUEUE_PI_NONE      -> Q_REQUEUE_PI_IGNORE
0026  *   Q_REQUEUE_PI_IN_PROGRESS   -> Q_REQUEUE_PI_WAIT
0027  *
0028  * On the requeue side:
0029  *   Q_REQUEUE_PI_NONE      -> Q_REQUEUE_PI_IN_PROGRESS
0030  *   Q_REQUEUE_PI_IN_PROGRESS   -> Q_REQUEUE_PI_DONE/LOCKED
0031  *   Q_REQUEUE_PI_IN_PROGRESS   -> Q_REQUEUE_PI_NONE (requeue failed)
0032  *   Q_REQUEUE_PI_WAIT      -> Q_REQUEUE_PI_DONE/LOCKED
0033  *   Q_REQUEUE_PI_WAIT      -> Q_REQUEUE_PI_IGNORE (requeue failed)
0034  *
0035  * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
0036  * signals that the waiter is already on the way out. It also means that
0037  * the waiter is still on the 'wait' futex, i.e. uaddr1.
0038  *
0039  * The waiter side signals early wakeup to the requeue side either through
0040  * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
0041  * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
0042  * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
0043  * which means the wakeup is interleaving with a requeue in progress it has
0044  * to wait for the requeue side to change the state. Either to DONE/LOCKED
0045  * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
0046  * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
0047  * the requeue side when the requeue attempt failed via deadlock detection
0048  * and therefore the waiter q is still on the uaddr1 futex.
0049  */
0050 enum {
         /*
          * Values of futex_q::requeue_state. The numeric order matters:
          * futex_requeue_pi_complete() computes Q_REQUEUE_PI_DONE + locked
          * and futex_requeue_pi_wakeup_sync() tests ">= Q_REQUEUE_PI_DONE",
          * so DONE and LOCKED have to stay the two highest, adjacent values.
          */
0051     Q_REQUEUE_PI_NONE       =  0,   /* initial state, set by futex_q_init */
0052     Q_REQUEUE_PI_IGNORE,            /* waiter stays on uaddr1, requeue skips it */
0053     Q_REQUEUE_PI_IN_PROGRESS,       /* requeue side has claimed the waiter */
0054     Q_REQUEUE_PI_WAIT,              /* early wakeup waits for the requeue side */
0055     Q_REQUEUE_PI_DONE,              /* waiter is on uaddr2 and blocked */
0056     Q_REQUEUE_PI_LOCKED,            /* waiter is on uaddr2 and has acquired it */
0057 };
0058 
0059 const struct futex_q futex_q_init = {
0060     /* The list member is left alone here, futex_queue() sets it up. */
0061     .requeue_state  = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
0062     .bitset     = FUTEX_BITSET_MATCH_ANY,
0063     .key        = FUTEX_KEY_INIT,
0064 };
0065 
0066 /**
0067  * requeue_futex() - Move a futex_q from one hash bucket to another
0068  * @q:      the futex_q being moved
0069  * @hb1:    the hash_bucket @q is taken from
0070  * @hb2:    the hash_bucket @q is added to
0071  * @key2:   the key @q carries once the requeue is done
0072  */
0073 static inline
0074 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
0075            struct futex_hash_bucket *hb2, union futex_key *key2)
0076 {
0077     /* key1 and key2 hash to one bucket: only the key has to change. */
0078     if (unlikely(&hb1->chain == &hb2->chain)) {
0079         q->key = *key2;
0080         return;
0081     }
0082 
0083     /* Take @q off hb1 and put it on hb2, adjusting both waiter counts. */
0084     plist_del(&q->list, &hb1->chain);
0085     futex_hb_waiters_dec(hb1);
0086     futex_hb_waiters_inc(hb2);
0087     plist_add(&q->list, &hb2->chain);
0088     q->lock_ptr = &hb2->lock;
0089     q->key = *key2;
0090 }
0091 
/**
 * futex_requeue_pi_prepare() - Claim a waiter for a requeue_pi attempt
 * @q:        the futex_q of the waiter
 * @pi_state: pointer stored in @q->pi_state on success; the trylock path
 *            in this file passes NULL
 *
 * Moves @q->requeue_state from Q_REQUEUE_PI_NONE to
 * Q_REQUEUE_PI_IN_PROGRESS. Any other state except Q_REQUEUE_PI_IGNORE is
 * left untouched and still counts as success.
 *
 * Return: false if the waiter already set Q_REQUEUE_PI_IGNORE, true otherwise
 */
0092 static inline bool futex_requeue_pi_prepare(struct futex_q *q,
0093                         struct futex_pi_state *pi_state)
0094 {
0095     int old, new;
0096 
0097     /*
0098      * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
0099      * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
0100      * ignore the waiter.
0101      */
0102     old = atomic_read_acquire(&q->requeue_state);
0103     do {
0104         if (old == Q_REQUEUE_PI_IGNORE)
0105             return false;
0106 
0107         /*
0108          * futex_proxy_trylock_atomic() might have set it to
0109          * IN_PROGRESS and an interleaved early wake to WAIT.
0110          *
0111          * It was considered to have an extra state for that
0112          * trylock, but that would just add more conditionals
0113          * all over the place for a dubious value.
0114          */
0115         if (old != Q_REQUEUE_PI_NONE)
0116             break;
0117 
0118         new = Q_REQUEUE_PI_IN_PROGRESS;
             /*
              * 'old' is never re-read explicitly; the loop relies on
              * atomic_try_cmpxchg() refreshing it when the exchange fails,
              * so the checks above run again on the current value.
              */
0119     } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
0120 
         /* Reached for NONE -> IN_PROGRESS and for the 'break' above alike. */
0121     q->pi_state = pi_state;
0122     return true;
0123 }
0124 
/**
 * futex_requeue_pi_complete() - Publish the outcome of a requeue attempt
 * @q:      the futex_q of the waiter
 * @locked: >= 0 if the requeue succeeded; the new state is
 *          Q_REQUEUE_PI_DONE + @locked, and the callers in this file pass
 *          0 (DONE) or 1 (LOCKED). < 0 if the requeue failed.
 *
 * Does nothing when the state is Q_REQUEUE_PI_IGNORE. On failure the state
 * goes back to Q_REQUEUE_PI_NONE, or to Q_REQUEUE_PI_IGNORE when the state
 * found is not Q_REQUEUE_PI_IN_PROGRESS (expected: Q_REQUEUE_PI_WAIT). On
 * PREEMPT_RT a waiter found in Q_REQUEUE_PI_WAIT is woken through
 * @q->requeue_wait.
 */
0125 static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
0126 {
0127     int old, new;
0128 
0129     old = atomic_read_acquire(&q->requeue_state);
0130     do {
0131         if (old == Q_REQUEUE_PI_IGNORE)
0132             return;
0133 
0134         if (locked >= 0) {
0135             /* Requeue succeeded. Set DONE or LOCKED */
0136             WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
0137                      old != Q_REQUEUE_PI_WAIT);
                 /* Depends on Q_REQUEUE_PI_LOCKED == Q_REQUEUE_PI_DONE + 1 */
0138             new = Q_REQUEUE_PI_DONE + locked;
0139         } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
0140             /* Deadlock, no early wakeup interleave */
0141             new = Q_REQUEUE_PI_NONE;
0142         } else {
0143             /* Deadlock, early wakeup interleave. */
0144             WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
0145             new = Q_REQUEUE_PI_IGNORE;
0146         }
0147     } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
0148 
0149 #ifdef CONFIG_PREEMPT_RT
0150     /* If the waiter interleaved with the requeue let it know */
0151     if (unlikely(old == Q_REQUEUE_PI_WAIT))
0152         rcuwait_wake_up(&q->requeue_wait);
0153 #endif
0154 }
0155 
/**
 * futex_requeue_pi_wakeup_sync() - Settle a woken waiter with the requeue side
 * @q:      the futex_q of the waiter that just woke up
 *
 * If the requeue already finished (state >= Q_REQUEUE_PI_DONE) that state
 * is returned right away. Otherwise Q_REQUEUE_PI_NONE is turned into
 * Q_REQUEUE_PI_IGNORE and any other state into Q_REQUEUE_PI_WAIT. When the
 * state replaced was Q_REQUEUE_PI_IN_PROGRESS, the function waits until the
 * state is no longer Q_REQUEUE_PI_WAIT.
 *
 * Return: the requeue state after synchronization. The caller in this file
 * handles Q_REQUEUE_PI_IGNORE, Q_REQUEUE_PI_DONE and Q_REQUEUE_PI_LOCKED
 * and treats anything else as a bug.
 */
0156 static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
0157 {
0158     int old, new;
0159 
0160     old = atomic_read_acquire(&q->requeue_state);
0161     do {
0162         /* Is requeue done already? */
0163         if (old >= Q_REQUEUE_PI_DONE)
0164             return old;
0165 
0166         /*
0167          * If not done, then tell the requeue code to either ignore
0168          * the waiter or to wake it up once the requeue is done.
0169          */
0170         new = Q_REQUEUE_PI_WAIT;
0171         if (old == Q_REQUEUE_PI_NONE)
0172             new = Q_REQUEUE_PI_IGNORE;
0173     } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
0174 
0175     /* If the requeue was in progress, wait for it to complete */
0176     if (old == Q_REQUEUE_PI_IN_PROGRESS) {
0177 #ifdef CONFIG_PREEMPT_RT
             /* Woken by rcuwait_wake_up() in futex_requeue_pi_complete() */
0178         rcuwait_wait_event(&q->requeue_wait,
0179                    atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
0180                    TASK_UNINTERRUPTIBLE);
0181 #else
             /* No rcuwait here: poll the state until it leaves WAIT */
0182         (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
0183 #endif
0184     }
0185 
0186     /*
0187      * Requeue is now either prohibited or complete. Reread state
0188      * because during the wait above it might have changed. Nothing
0189      * will modify q->requeue_state after this point.
0190      */
0191     return atomic_read(&q->requeue_state);
0192 }
0193 
0194 /**
0195  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
0196  * @q:      the futex_q
0197  * @key:    the key of the requeue target futex
0198  * @hb:     the hash_bucket of the requeue target futex
0199  *
0200  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
0201  * target futex if it is uncontended or via a lock steal.
0202  *
0203  * 1) Set @q::key to the requeue target futex key so the waiter can detect
0204  *    the wakeup on the right futex.
0205  *
0206  * 2) Dequeue @q from the hash bucket.
0207  *
0208  * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
0209  *    acquisition.
0210  *
0211  * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
0212  *    the waiter has to fixup the pi state.
0213  *
0214  * 5) Complete the requeue state so the waiter can make progress. After
0215  *    this point the waiter task can return from the syscall immediately in
0216  *    case that the pi state does not have to be fixed up.
0217  *
0218  * 6) Wake the waiter task.
0219  *
0220  * The statements below follow that order; do not reorder them.
      *
0220  * Must be called with both q->lock_ptr and hb->lock held.
0221  */
0222 static inline
0223 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
0224                struct futex_hash_bucket *hb)
0225 {
         /* Step 1 */
0226     q->key = *key;
0227 
         /* Step 2 */
0228     __futex_unqueue(q);
0229 
         /* Step 3: a waiter without rt_waiter is unexpected here, hence the warning */
0230     WARN_ON(!q->rt_waiter);
0231     q->rt_waiter = NULL;
0232 
         /* Step 4 */
0233     q->lock_ptr = &hb->lock;
0234 
0235     /* Signal locked state to the waiter */
         /* Step 5: locked == 1 selects Q_REQUEUE_PI_DONE + 1, i.e. Q_REQUEUE_PI_LOCKED */
0236     futex_requeue_pi_complete(q, 1);
         /* Step 6 */
0237     wake_up_state(q->task, TASK_NORMAL);
0238 }
0239 
0240 /**
0241  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
0242  * @pifutex:        the user address of the target ('to') futex
0243  * @hb1:        the from futex hash bucket, must be locked by the caller
0244  * @hb2:        the to futex hash bucket, must be locked by the caller
0245  * @key1:       the from futex key
0246  * @key2:       the to futex key
0247  * @ps:         address to store the pi_state pointer
0248  * @exiting:        Pointer to store the task pointer of the owner task
0249  *          which is in the middle of exiting
0250  * @set_waiters:    force setting the FUTEX_WAITERS bit (1) or not (0)
0251  *
0252  * Try and get the lock on behalf of the top waiter if we can do it atomically.
0253  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
0254  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
0255  * hb1 and hb2 must be held by the caller.
0256  *
0257  * @exiting is only set when the return value is -EBUSY. If so, this holds
0258  * a refcount on the exiting task on return and the caller needs to drop it
0259  * after waiting for the exit to complete.
0260  *
0261  * Return:
0262  *  -  0 - failed to acquire the lock atomically;
0263  *  - >0 - acquired the lock, return value is vpid of the top_waiter
      *         NOTE(review): the code below and futex_requeue() treat only a
      *         return value of exactly 1 as "acquired"; the vpid wording
      *         looks stale - confirm against futex_lock_pi_atomic().
0264  *  - <0 - error
      *
      * Errors produced directly in this function: -EFAULT if the user value
      * cannot be read (or fault injection triggers), -EINVAL if the top waiter
      * is not a matching requeue_pi waiter, -EAGAIN if an early wakeup already
      * marked the top waiter to be ignored. Other negative values are passed
      * through from futex_lock_pi_atomic().
0265  */
0266 static int
0267 futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
0268                struct futex_hash_bucket *hb2, union futex_key *key1,
0269                union futex_key *key2, struct futex_pi_state **ps,
0270                struct task_struct **exiting, int set_waiters)
0271 {
0272     struct futex_q *top_waiter = NULL;
0273     u32 curval;
0274     int ret;
0275 
         /* curval is not used further; the read only checks that @pifutex is accessible */
0276     if (futex_get_value_locked(&curval, pifutex))
0277         return -EFAULT;
0278 
0279     if (unlikely(should_fail_futex(true)))
0280         return -EFAULT;
0281 
0282     /*
0283      * Find the top_waiter and determine if there are additional waiters.
0284      * If the caller intends to requeue more than 1 waiter to pifutex,
0285      * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
0286      * as we have means to handle the possible fault.  If not, don't set
0287      * the bit unnecessarily as it will force the subsequent unlock to enter
0288      * the kernel.
0289      */
0290     top_waiter = futex_top_waiter(hb1, key1);
0291 
0292     /* There are no waiters, nothing for us to do. */
0293     if (!top_waiter)
0294         return 0;
0295 
0296     /*
0297      * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
0298      * and waiting on the 'waitqueue' futex which is always !PI.
0299      */
0300     if (!top_waiter->rt_waiter || top_waiter->pi_state)
0301         return -EINVAL;
0302 
0303     /* Ensure we requeue to the expected futex. */
0304     if (!futex_match(top_waiter->requeue_pi_key, key2))
0305         return -EINVAL;
0306 
0307     /* Ensure that this does not race against an early wakeup */
0308     if (!futex_requeue_pi_prepare(top_waiter, NULL))
0309         return -EAGAIN;
0310 
0311     /*
0312      * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
0313      * in the contended case or if @set_waiters is true.
0314      *
0315      * In the contended case PI state is attached to the lock owner. If
0316      * the user space lock can be acquired then PI state is attached to
0317      * the new owner (@top_waiter->task) when @set_waiters is true.
0318      */
0319     ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
0320                    exiting, set_waiters);
0321     if (ret == 1) {
0322         /*
0323          * Lock was acquired in user space and PI state was
0324          * attached to @top_waiter->task. That means state is fully
0325          * consistent and the waiter can return to user space
0326          * immediately after the wakeup.
0327          */
0328         requeue_pi_wake_futex(top_waiter, key2, hb2);
0329     } else if (ret < 0) {
0330         /* Rewind top_waiter::requeue_state */
             /* ret is negative here, which selects the failure branch of the helper */
0331         futex_requeue_pi_complete(top_waiter, ret);
0332     } else {
0333         /*
0334          * futex_lock_pi_atomic() did not acquire the user space
0335          * futex, but managed to establish the proxy lock and pi
0336          * state. top_waiter::requeue_state cannot be fixed up here
0337          * because the waiter is not enqueued on the rtmutex
0338          * yet. This is handled at the callsite depending on the
0339          * result of rt_mutex_start_proxy_lock() which is
0340          * guaranteed to be reached with this function returning 0.
0341          */
0342     }
0343     return ret;
0344 }
0345 
0346 /**
0347  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
0348  * @uaddr1: source futex user address
0349  * @flags:  futex flags (FLAGS_SHARED, etc.)
0350  * @uaddr2: target futex user address
0351  * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
0352  * @nr_requeue: number of waiters to requeue (0-INT_MAX)
0353  * @cmpval: @uaddr1 expected value (or %NULL)
0354  * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
0355  *      pi futex (pi to pi requeue is not supported)
0356  *
0357  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
0358  * uaddr2 atomically on behalf of the top waiter.
0359  *
0360  * Return:
0361  *  - >=0 - on success, the number of tasks requeued or woken;
0362  *  -  <0 - on error
0363  */
0364 int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
0365           int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
0366 {
0367     union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
0368     int task_count = 0, ret;
0369     struct futex_pi_state *pi_state = NULL;
0370     struct futex_hash_bucket *hb1, *hb2;
0371     struct futex_q *this, *next;
0372     DEFINE_WAKE_Q(wake_q);
0373 
0374     if (nr_wake < 0 || nr_requeue < 0)
0375         return -EINVAL;
0376 
0377     /*
0378      * When PI not supported: return -ENOSYS if requeue_pi is true,
0379      * consequently the compiler knows requeue_pi is always false past
0380      * this point which will optimize away all the conditional code
0381      * further down.
0382      */
0383     if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
0384         return -ENOSYS;
0385 
0386     if (requeue_pi) {
0387         /*
0388          * Requeue PI only works on two distinct uaddrs. This
0389          * check is only valid for private futexes. See below.
0390          */
0391         if (uaddr1 == uaddr2)
0392             return -EINVAL;
0393 
0394         /*
0395          * futex_requeue() allows the caller to define the number
0396          * of waiters to wake up via the @nr_wake argument. With
0397          * REQUEUE_PI, waking up more than one waiter is creating
0398          * more problems than it solves. Waking up a waiter only
0399          * makes sense if the PI futex @uaddr2 is uncontended as
0400          * this allows the requeue code to acquire the futex
0401          * @uaddr2 before waking the waiter. The waiter can then
0402          * return to user space without further action. A secondary
0403          * wakeup would just make the futex_wait_requeue_pi()
0404          * handling more complex, because that code would have to
0405          * look up pi_state and do more or less all the handling
0406          * which the requeue code has to do for the to be requeued
0407          * waiters. So restrict the number of waiters to wake to
0408          * one, and only wake it up when the PI futex is
0409          * uncontended. Otherwise requeue it and let the unlock of
0410          * the PI futex handle the wakeup.
0411          *
0412          * All REQUEUE_PI users, e.g. pthread_cond_signal() and
0413          * pthread_cond_broadcast() must use nr_wake=1.
0414          */
0415         if (nr_wake != 1)
0416             return -EINVAL;
0417 
0418         /*
0419          * requeue_pi requires a pi_state, try to allocate it now
0420          * without any locks in case it fails.
0421          */
0422         if (refill_pi_state_cache())
0423             return -ENOMEM;
0424     }
0425 
     /* Full restart: both keys and both hash buckets are looked up again. */
0426 retry:
0427     ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
0428     if (unlikely(ret != 0))
0429         return ret;
0430     ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
0431                 requeue_pi ? FUTEX_WRITE : FUTEX_READ);
0432     if (unlikely(ret != 0))
0433         return ret;
0434 
0435     /*
0436      * The check above which compares uaddrs is not sufficient for
0437      * shared futexes. We need to compare the keys:
0438      */
0439     if (requeue_pi && futex_match(&key1, &key2))
0440         return -EINVAL;
0441 
0442     hb1 = futex_hash(&key1);
0443     hb2 = futex_hash(&key2);
0444 
     /* Restart that keeps the keys and hash buckets computed above. */
0445 retry_private:
         /* Balanced by futex_hb_waiters_dec(hb2) on every path that leaves or retries */
0446     futex_hb_waiters_inc(hb2);
0447     double_lock_hb(hb1, hb2);
0448 
0449     if (likely(cmpval != NULL)) {
0450         u32 curval;
0451 
0452         ret = futex_get_value_locked(&curval, uaddr1);
0453 
0454         if (unlikely(ret)) {
                 /*
                  * The read under the locks failed. Drop the locks, read
                  * the value with get_user() and, if that works, start
                  * over. Private futexes skip the key lookup.
                  */
0455             double_unlock_hb(hb1, hb2);
0456             futex_hb_waiters_dec(hb2);
0457 
0458             ret = get_user(curval, uaddr1);
0459             if (ret)
0460                 return ret;
0461 
0462             if (!(flags & FLAGS_SHARED))
0463                 goto retry_private;
0464 
0465             goto retry;
0466         }
0467         if (curval != *cmpval) {
0468             ret = -EAGAIN;
0469             goto out_unlock;
0470         }
0471     }
0472 
0473     if (requeue_pi) {
0474         struct task_struct *exiting = NULL;
0475 
0476         /*
0477          * Attempt to acquire uaddr2 and wake the top waiter. If we
0478          * intend to requeue waiters, force setting the FUTEX_WAITERS
0479          * bit.  We force this here where we are able to easily handle
0480          * faults rather than in the requeue loop below.
0481          *
0482          * Updates topwaiter::requeue_state if a top waiter exists.
0483          */
0484         ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
0485                          &key2, &pi_state,
0486                          &exiting, nr_requeue);
0487 
0488         /*
0489          * At this point the top_waiter has either taken uaddr2 or
0490          * is waiting on it. In both cases pi_state has been
0491          * established and an initial refcount on it. In case of an
0492          * error there's nothing.
0493          *
0494          * The top waiter's requeue_state is up to date:
0495          *
0496          *  - If the lock was acquired atomically (ret == 1), then
0497          *    the state is Q_REQUEUE_PI_LOCKED.
0498          *
0499          *    The top waiter has been dequeued and woken up and can
0500          *    return to user space immediately. The kernel/user
0501          *    space state is consistent. In case that there must be
0502          *    more waiters requeued the WAITERS bit in the user
0503          *    space futex is set so the top waiter task has to go
0504          *    into the syscall slowpath to unlock the futex. This
0505          *    will block until this requeue operation has been
0506          *    completed and the hash bucket locks have been
0507          *    dropped.
0508          *
0509          *  - If the trylock failed with an error (ret < 0) then
0510          *    the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
0511          *    happened", or Q_REQUEUE_PI_IGNORE when there was an
0512          *    interleaved early wakeup.
0513          *
0514          *  - If the trylock did not succeed (ret == 0) then the
0515          *    state is either Q_REQUEUE_PI_IN_PROGRESS or
0516          *    Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
0517          *    This will be cleaned up in the loop below, which
0518          *    cannot fail because futex_proxy_trylock_atomic() did
0519          *    the same sanity checks for requeue_pi as the loop
0520          *    below does.
0521          */
0522         switch (ret) {
0523         case 0:
0524             /* We hold a reference on the pi state. */
0525             break;
0526 
0527         case 1:
0528             /*
0529              * futex_proxy_trylock_atomic() acquired the user space
0530              * futex. Adjust task_count.
0531              */
0532             task_count++;
0533             ret = 0;
0534             break;
0535 
0536         /*
0537          * If the above failed, then pi_state is NULL and
0538          * waiter::requeue_state is correct.
0539          */
0540         case -EFAULT:
0541             double_unlock_hb(hb1, hb2);
0542             futex_hb_waiters_dec(hb2);
0543             ret = fault_in_user_writeable(uaddr2);
0544             if (!ret)
0545                 goto retry;
0546             return ret;
0547         case -EBUSY:
0548         case -EAGAIN:
0549             /*
0550              * Two reasons for this:
0551              * - EBUSY: Owner is exiting and we just wait for the
0552              *   exit to complete.
0553              * - EAGAIN: The user space value changed.
0554              */
0555             double_unlock_hb(hb1, hb2);
0556             futex_hb_waiters_dec(hb2);
0557             /*
0558              * Handle the case where the owner is in the middle of
0559              * exiting. Wait for the exit to complete otherwise
0560              * this task might loop forever, aka. live lock.
0561              */
0562             wait_for_owner_exiting(ret, exiting);
0563             cond_resched();
0564             goto retry;
0565         default:
                 /* Any other value is handed back to the caller unchanged */
0566             goto out_unlock;
0567         }
0568     }
0569 
0570     plist_for_each_entry_safe(this, next, &hb1->chain, list) {
             /* Stop once nr_wake + nr_requeue waiters have been handled */
0571         if (task_count - nr_wake >= nr_requeue)
0572             break;
0573 
             /* The bucket may hold waiters of other futexes; skip those */
0574         if (!futex_match(&this->key, &key1))
0575             continue;
0576 
0577         /*
0578          * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
0579          * be paired with each other and no other futex ops.
0580          *
0581          * We should never be requeueing a futex_q with a pi_state,
0582          * which is awaiting a futex_unlock_pi().
0583          */
0584         if ((requeue_pi && !this->rt_waiter) ||
0585             (!requeue_pi && this->rt_waiter) ||
0586             this->pi_state) {
0587             ret = -EINVAL;
0588             break;
0589         }
0590 
0591         /* Plain futexes just wake or requeue and are done */
0592         if (!requeue_pi) {
                 /* The first nr_wake matches are woken, the rest requeued */
0593             if (++task_count <= nr_wake)
0594                 futex_wake_mark(&wake_q, this);
0595             else
0596                 requeue_futex(this, hb1, hb2, &key2);
0597             continue;
0598         }
0599 
0600         /* Ensure we requeue to the expected futex for requeue_pi. */
0601         if (!futex_match(this->requeue_pi_key, &key2)) {
0602             ret = -EINVAL;
0603             break;
0604         }
0605 
0606         /*
0607          * Requeue nr_requeue waiters and possibly one more in the case
0608          * of requeue_pi if we couldn't acquire the lock atomically.
0609          *
0610          * Prepare the waiter to take the rt_mutex. Take a refcount
0611          * on the pi_state and store the pointer in the futex_q
0612          * object of the waiter.
0613          */
0614         get_pi_state(pi_state);
0615 
0616         /* Don't requeue when the waiter is already on the way out. */
0617         if (!futex_requeue_pi_prepare(this, pi_state)) {
0618             /*
0619              * Early woken waiter signaled that it is on the
0620              * way out. Drop the pi_state reference and try the
0621              * next waiter. @this->pi_state is still NULL.
0622              */
0623             put_pi_state(pi_state);
0624             continue;
0625         }
0626 
0627         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
0628                         this->rt_waiter,
0629                         this->task);
0630 
0631         if (ret == 1) {
0632             /*
0633              * We got the lock. We do neither drop the refcount
0634              * on pi_state nor clear this->pi_state because the
0635              * waiter needs the pi_state for cleaning up the
0636              * user space value. It will drop the refcount
0637              * after doing so. this::requeue_state is updated
0638              * in the wakeup as well.
0639              */
0640             requeue_pi_wake_futex(this, &key2, hb2);
0641             task_count++;
0642         } else if (!ret) {
0643             /* Waiter is queued, move it to hb2 */
0644             requeue_futex(this, hb1, hb2, &key2);
                 /* locked == 0 publishes Q_REQUEUE_PI_DONE */
0645             futex_requeue_pi_complete(this, 0);
0646             task_count++;
0647         } else {
0648             /*
0649              * rt_mutex_start_proxy_lock() detected a potential
0650              * deadlock when we tried to queue that waiter.
0651              * Drop the pi_state reference which we took above
0652              * and remove the pointer to the state from the
0653              * waiters futex_q object.
0654              */
0655             this->pi_state = NULL;
0656             put_pi_state(pi_state);
0657             futex_requeue_pi_complete(this, ret);
0658             /*
0659              * We stop queueing more waiters and let user space
0660              * deal with the mess.
0661              */
0662             break;
0663         }
0664     }
0665 
0666     /*
0667      * We took an extra initial reference to the pi_state in
0668      * futex_proxy_trylock_atomic(). We need to drop it here again.
0669      */
         /*
          * NOTE(review): pi_state is still NULL here when requeue_pi is false
          * (and possibly when the trylock found no top waiter); presumably
          * put_pi_state() accepts NULL - confirm.
          */
0670     put_pi_state(pi_state);
0671 
0672 out_unlock:
0673     double_unlock_hb(hb1, hb2);
         /* Tasks marked via futex_wake_mark() are woken after the locks are dropped */
0674     wake_up_q(&wake_q);
0675     futex_hb_waiters_dec(hb2);
         /* A non-zero ret wins over the count of woken/requeued tasks */
0676     return ret ? ret : task_count;
0677 }
0678 
0679 /**
0680  * handle_early_requeue_pi_wakeup() - Sort out a wakeup that preceded the requeue
0681  * @hb:     hash_bucket on which @q was originally enqueued
0682  * @q:      futex_q of the task that woke up before being requeued
0683  * @timeout:    hrtimer sleeper of the wait, or NULL if there is none
0684  *
0685  * Removes @q from @hb and classifies the wakeup.
0686  *
0687  * Return:
0688  *  -ETIMEDOUT if @timeout is set and its task pointer has been cleared,
0689  *  -ERESTARTNOINTR if a signal is pending, -EWOULDBLOCK in any other case
0690  */
0691 static inline
0692 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
0693                    struct futex_q *q,
0694                    struct hrtimer_sleeper *timeout)
0695 {
0696     /*
0697      * The hb lock is held, which is enough to process the wakeup without
0698      * races: while it is held the wakeup code cannot switch q.key from
0699      * uaddr to uaddr2. A further requeue away from uaddr2 cannot happen
0700      * either, as a PI aware source futex is not supported for requeue.
0701      */
0702     WARN_ON_ONCE(&hb->lock != q->lock_ptr);
0703 
0704     /* A timeout or signal woke us before any requeue: leave the bucket. */
0705     plist_del(&q->list, &hb->chain);
0706     futex_hb_waiters_dec(hb);
0707 
0708     if (timeout && !timeout->task)
0709         return -ETIMEDOUT;
0710 
0711     if (signal_pending(current))
0712         return -ERESTARTNOINTR;
0713 
0714     /* Neither applies: handle the spurious wakeup gracefully. */
0715     return -EWOULDBLOCK;
0716 }
0721 
0722 /**
0723  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
0724  * @uaddr:  the futex we initially wait on (non-pi)
0725  * @flags:  futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
0726  *      the same type, no requeueing from private to shared, etc.
0727  * @val:    the expected value of uaddr
0728  * @abs_time:   absolute timeout
0729  * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
0730  * @uaddr2: the pi futex we will take prior to returning to user-space
0731  *
0732  * The caller will wait on uaddr and will be requeued by futex_requeue() to
0733  * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
0734  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
0735  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
0736  * without one, the pi logic would not know which task to boost/deboost, if
0737  * there was a need to.
0738  *
0739  * We call schedule in futex_wait_queue() when we enqueue and return there
0740  * via the following--
0741  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
0742  * 2) wakeup on uaddr2 after a requeue
0743  * 3) signal
0744  * 4) timeout
0745  *
0746  * If 3, cleanup and return -ERESTARTNOINTR.
0747  *
0748  * If 2, we may then block on trying to take the rt_mutex and return via:
0749  * 5) successful lock
0750  * 6) signal
0751  * 7) timeout
0752  * 8) other lock acquisition failure
0753  *
0754  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
0755  *
0756  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
0757  *
0758  * Return:
0759  *  -  0 - On success;
0760  *  - <0 - On error
0761  */
0762 int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
0763               u32 val, ktime_t *abs_time, u32 bitset,
0764               u32 __user *uaddr2)
0765 {
0766     struct hrtimer_sleeper timeout, *to;
0767     struct rt_mutex_waiter rt_waiter;
0768     struct futex_hash_bucket *hb;
0769     union futex_key key2 = FUTEX_KEY_INIT;
0770     struct futex_q q = futex_q_init;
0771     struct rt_mutex_base *pi_mutex;
0772     int res, ret;
0773 
0774     if (!IS_ENABLED(CONFIG_FUTEX_PI))
0775         return -ENOSYS;
0776 
         /* Cheap address check; the key comparison further down covers shared futexes */
0777     if (uaddr == uaddr2)
0778         return -EINVAL;
0779 
0780     if (!bitset)
0781         return -EINVAL;
0782 
         /* 'to' may be NULL (see the check at 'out:'); every later use tolerates that */
0783     to = futex_setup_timer(abs_time, &timeout, flags,
0784                    current->timer_slack_ns);
0785 
0786     /*
0787      * The waiter is allocated on our stack, manipulated by the requeue
0788      * code while we sleep on uaddr.
0789      */
0790     rt_mutex_init_waiter(&rt_waiter);
0791 
0792     ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
0793     if (unlikely(ret != 0))
0794         goto out;
0795 
         /*
          * rt_waiter and requeue_pi_key are what the requeue side checks to
          * recognize this futex_q as a requeue_pi waiter for uaddr2.
          */
0796     q.bitset = bitset;
0797     q.rt_waiter = &rt_waiter;
0798     q.requeue_pi_key = &key2;
0799 
0800     /*
0801      * Prepare to wait on uaddr. On success, it holds hb->lock and q
0802      * is initialized.
0803      */
0804     ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
0805     if (ret)
0806         goto out;
0807 
0808     /*
0809      * The check above which compares uaddrs is not sufficient for
0810      * shared futexes. We need to compare the keys:
0811      */
0812     if (futex_match(&q.key, &key2)) {
0813         futex_q_unlock(hb);
0814         ret = -EINVAL;
0815         goto out;
0816     }
0817 
0818     /* Queue the futex_q, drop the hb lock, wait for wakeup. */
0819     futex_wait_queue(hb, &q, to);
0820 
         /* Settle with the requeue side; the resulting state tells where q ended up */
0821     switch (futex_requeue_pi_wakeup_sync(&q)) {
0822     case Q_REQUEUE_PI_IGNORE:
0823         /* The waiter is still on uaddr1 */
0824         spin_lock(&hb->lock);
0825         ret = handle_early_requeue_pi_wakeup(hb, &q, to);
0826         spin_unlock(&hb->lock);
0827         break;
0828 
0829     case Q_REQUEUE_PI_LOCKED:
0830         /* The requeue acquired the lock */
             /*
              * ret is still 0 from futex_wait_setup() at this point and
              * only changes if the owner fixup below runs.
              */
0831         if (q.pi_state && (q.pi_state->owner != current)) {
0832             spin_lock(q.lock_ptr);
0833             ret = fixup_pi_owner(uaddr2, &q, true);
0834             /*
0835              * Drop the reference to the pi state which the
0836              * requeue_pi() code acquired for us.
0837              */
0838             put_pi_state(q.pi_state);
0839             spin_unlock(q.lock_ptr);
0840             /*
0841              * Adjust the return value. It's either -EFAULT or
0842              * success (1) but the caller expects 0 for success.
0843              */
0844             ret = ret < 0 ? ret : 0;
0845         }
0846         break;
0847 
0848     case Q_REQUEUE_PI_DONE:
0849         /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
0850         pi_mutex = &q.pi_state->pi_mutex;
0851         ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
0852 
0853         /* Current is no longer pi_blocked_on */
0854         spin_lock(q.lock_ptr);
             /*
              * The wait failed, but the cleanup reports (by returning false)
              * that the failure does not stand, so treat it as success.
              * NOTE(review): presumably the lock was acquired in the
              * meantime - confirm against rt_mutex_cleanup_proxy_lock().
              */
0855         if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
0856             ret = 0;
0857 
0858         debug_rt_mutex_free_waiter(&rt_waiter);
0859         /*
0860          * Fixup the pi_state owner and possibly acquire the lock if we
0861          * haven't already.
0862          */
0863         res = fixup_pi_owner(uaddr2, &q, !ret);
0864         /*
0865          * If fixup_pi_owner() returned an error, propagate that.  If it
0866          * acquired the lock, clear -ETIMEDOUT or -EINTR.
0867          */
0868         if (res)
0869             ret = (res < 0) ? res : 0;
0870 
0871         futex_unqueue_pi(&q);
0872         spin_unlock(q.lock_ptr);
0873 
0874         if (ret == -EINTR) {
0875             /*
0876              * We've already been requeued, but cannot restart
0877              * by calling futex_lock_pi() directly. We could
0878              * restart this syscall, but it would detect that
0879              * the user space "val" changed and return
0880              * -EWOULDBLOCK.  Save the overhead of the restart
0881              * and return -EWOULDBLOCK directly.
0882              */
0883             ret = -EWOULDBLOCK;
0884         }
0885         break;
0886     default:
             /* No other state is expected once the sync above has returned */
0887         BUG();
0888     }
0889 
0890 out:
         /* Tear down the on-stack timer, if one was set up */
0891     if (to) {
0892         hrtimer_cancel(&to->timer);
0893         destroy_hrtimer_on_stack(&to->timer);
0894     }
0895     return ret;
0896 }
0897