// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
    struct futex_pi_state *pi_state;

    if (likely(current->pi_state_cache))
        return 0;

    pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

    if (!pi_state)
        return -ENOMEM;

    INIT_LIST_HEAD(&pi_state->list);
    /* pi_mutex gets initialized later */
    pi_state->owner = NULL;
    refcount_set(&pi_state->refcount, 1);
    pi_state->key = FUTEX_KEY_INIT;

    current->pi_state_cache = pi_state;

    return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
    struct futex_pi_state *pi_state = current->pi_state_cache;

    WARN_ON(!pi_state);
    current->pi_state_cache = NULL;

    return pi_state;
}
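
/*
 * Illustrative note (not part of the original source): the two helpers above
 * form a one-slot, per-task allocation cache. A caller refills the cache
 * while it may still sleep, so that alloc_pi_state() cannot fail once atomic
 * context has been entered:
 *
 *      if (refill_pi_state_cache())    // GFP_KERNEL allocation, may sleep
 *              return -ENOMEM;
 *      spin_lock(&hb->lock);           // atomic context from here on
 *      ...
 *      pi_state = alloc_pi_state();    // consumes the cached object
 *
 * futex_lock_pi() below follows exactly this pattern.
 */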

static void pi_state_update_owner(struct futex_pi_state *pi_state,
                  struct task_struct *new_owner)
{
    struct task_struct *old_owner = pi_state->owner;

    lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

    if (old_owner) {
        raw_spin_lock(&old_owner->pi_lock);
        WARN_ON(list_empty(&pi_state->list));
        list_del_init(&pi_state->list);
        raw_spin_unlock(&old_owner->pi_lock);
    }

    if (new_owner) {
        raw_spin_lock(&new_owner->pi_lock);
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &new_owner->pi_state_list);
        pi_state->owner = new_owner;
        raw_spin_unlock(&new_owner->pi_lock);
    }
}

void get_pi_state(struct futex_pi_state *pi_state)
{
    WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
    if (!pi_state)
        return;

    if (!refcount_dec_and_test(&pi_state->refcount))
        return;

    /*
     * If pi_state->owner is NULL, the owner is most probably dying
     * and has cleaned up the pi_state already
     */
    if (pi_state->owner) {
        unsigned long flags;

        raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
        pi_state_update_owner(pi_state, NULL);
        rt_mutex_proxy_unlock(&pi_state->pi_mutex);
        raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
    }

    if (current->pi_state_cache) {
        kfree(pi_state);
    } else {
        /*
         * pi_state->list is already empty.
         * clear pi_state->owner.
         * refcount is at 0 - put it back to 1.
         */
        pi_state->owner = NULL;
        refcount_set(&pi_state->refcount, 1);
        current->pi_state_cache = pi_state;
    }
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]  Indicates that the kernel can acquire the futex atomically. We
 *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]  Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]  Invalid. The waiter is queued on a non-PI futex.
 *
 * [4]  Valid state after exit_robust_list(), which sets the user space
 *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]  The user space value got manipulated between exit_robust_list()
 *      and exit_pi_state_list().
 *
 * [6]  Valid state after exit_pi_state_list() which sets the new owner in
 *      the pi_state but cannot access the user space value.
 *
 * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]  Owner and user space value match.
 *
 * [9]  There is no transient state which sets the user space TID to 0
 *      except exit_robust_list(), but this is indicated by the
 *      FUTEX_OWNER_DIED bit. See [4].
 *
 * [10] There is no transient state which leaves owner and user space
 *      TID out of sync. Except one error case where the kernel is denied
 *      write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *      hb -> futex_q, relation
 *      futex_q -> pi_state, relation
 *
 *      (cannot be raw because hb can contain an arbitrary number
 *       of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *      {uval, pi_state}
 *
 *      (and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *      p->pi_state_list -> pi_state->list, relation
 *      pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *      pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
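
/*
 * Illustrative addition (not in the original source): the lock order above,
 * written out as the code in this file actually nests it. See
 * futex_unlock_pi() and pi_state_update_owner() for real instances:
 *
 *      spin_lock(&hb->lock);
 *      raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 *      raw_spin_lock(&p->pi_lock);
 *      ...
 *      raw_spin_unlock(&p->pi_lock);
 *      raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 *      spin_unlock(&hb->lock);
 */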

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
                  struct futex_pi_state *pi_state,
                  struct futex_pi_state **ps)
{
    pid_t pid = uval & FUTEX_TID_MASK;
    u32 uval2;
    int ret;

    /*
     * Userspace might have messed up non-PI and PI futexes [3]
     */
    if (unlikely(!pi_state))
        return -EINVAL;

    /*
     * We get here with hb->lock held, and having found a
     * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
     * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
     * which in turn means that futex_lock_pi() still has a reference on
     * our pi_state.
     *
     * The waiter holding a reference on @pi_state also protects against
     * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
     * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
     * free pi_state before we can take a reference ourselves.
     */
    WARN_ON(!refcount_read(&pi_state->refcount));

    /*
     * Now that we have a pi_state, we can acquire wait_lock
     * and do the state validation.
     */
    raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

    /*
     * Since {uval, pi_state} is serialized by wait_lock, and our current
     * uval was read without holding it, it can have changed. Verify it
     * still is what we expect it to be, otherwise retry the entire
     * operation.
     */
    if (futex_get_value_locked(&uval2, uaddr))
        goto out_efault;

    if (uval != uval2)
        goto out_eagain;

    /*
     * Handle the owner died case:
     */
    if (uval & FUTEX_OWNER_DIED) {
        /*
         * exit_pi_state_list sets owner to NULL and wakes the
         * topmost waiter. The task which acquires the
         * pi_state->rt_mutex will fixup owner.
         */
        if (!pi_state->owner) {
            /*
             * No pi state owner, but the user space TID
             * is not 0. Inconsistent state. [5]
             */
            if (pid)
                goto out_einval;
            /*
             * Take a ref on the state and return success. [4]
             */
            goto out_attach;
        }

        /*
         * If TID is 0, then either the dying owner has not
         * yet executed exit_pi_state_list() or some waiter
         * acquired the rtmutex in the pi state, but did not
         * yet fixup the TID in user space.
         *
         * Take a ref on the state and return success. [6]
         */
        if (!pid)
            goto out_attach;
    } else {
        /*
         * If the owner died bit is not set, then the pi_state
         * must have an owner. [7]
         */
        if (!pi_state->owner)
            goto out_einval;
    }

    /*
     * Bail out if user space manipulated the futex value. If pi
     * state exists then the owner TID must be the same as the
     * user space TID. [9/10]
     */
    if (pid != task_pid_vnr(pi_state->owner))
        goto out_einval;

out_attach:
    get_pi_state(pi_state);
    raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    *ps = pi_state;
    return 0;

out_einval:
    ret = -EINVAL;
    goto out_error;

out_eagain:
    ret = -EAGAIN;
    goto out_error;

out_efault:
    ret = -EFAULT;
    goto out_error;

out_error:
    raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    return ret;
}

static int handle_exit_race(u32 __user *uaddr, u32 uval,
                struct task_struct *tsk)
{
    u32 uval2;

    /*
     * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
     * caller that the alleged owner is busy.
     */
    if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
        return -EBUSY;

    /*
     * Reread the user space value to handle the following situation:
     *
     * CPU0                             CPU1
     *
     * sys_exit()                       sys_futex()
     *  do_exit()                        futex_lock_pi()
     *                                    futex_lock_pi_atomic()
     *   exit_signals(tsk)                 No waiters:
     *    tsk->flags |= PF_EXITING;        *uaddr == 0x00000PID
     *  mm_release(tsk)                    Set waiter bit
     *   exit_robust_list(tsk) {           *uaddr = 0x80000PID;
     *      Set owner died                 attach_to_pi_owner() {
     *      *uaddr = 0xC0000000;            tsk = get_task(PID);
     *   }                                  if (!tsk->flags & PF_EXITING) {
     *  ...                                   attach();
     *  tsk->futex_state =                  } else {
     *      FUTEX_STATE_DEAD;                 if (tsk->futex_state !=
     *                                          FUTEX_STATE_DEAD)
     *                                           return -EAGAIN;
     *                                         return -ESRCH; <--- FAIL
     *                                      }
     *
     * Returning ESRCH unconditionally is wrong here because the
     * user space value has been changed by the exiting task.
     *
     * The same logic applies to the case where the exiting task is
     * already gone.
     */
    if (futex_get_value_locked(&uval2, uaddr))
        return -EFAULT;

    /* If the user space value has changed, try again. */
    if (uval2 != uval)
        return -EAGAIN;

    /*
     * The exiting task did not have a robust list, the robust list was
     * corrupted or the user space value in *uaddr is simply bogus.
     * Give up and tell user space.
     */
    return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
                 struct futex_pi_state **ps)
{
    /*
     * No existing pi state. First waiter. [2]
     *
     * This creates pi_state, we have hb->lock held, this means nothing can
     * observe this state, wait_lock is irrelevant.
     */
    struct futex_pi_state *pi_state = alloc_pi_state();

    /*
     * Initialize the pi_mutex in locked state and make @p
     * the owner of it:
     */
    rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

    /* Store the key for possible exit cleanups: */
    pi_state->key = *key;

    WARN_ON(!list_empty(&pi_state->list));
    list_add(&pi_state->list, &p->pi_state_list);
    /*
     * Assignment without holding pi_state->pi_mutex.wait_lock is safe
     * because there is no concurrency as the object is not published yet.
     */
    pi_state->owner = p;

    *ps = pi_state;
}
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
                  struct futex_pi_state **ps,
                  struct task_struct **exiting)
{
    pid_t pid = uval & FUTEX_TID_MASK;
    struct task_struct *p;

    /*
     * We are the first waiter - try to look up the real owner and attach
     * the new pi_state to it, but bail out when TID = 0 [1]
     *
     * The !pid check is paranoid. None of the call sites should end up
     * with pid == 0, but better safe than sorry. Let the caller retry.
     */
    if (!pid)
        return -EAGAIN;
    p = find_get_task_by_vpid(pid);
    if (!p)
        return handle_exit_race(uaddr, uval, NULL);

    if (unlikely(p->flags & PF_KTHREAD)) {
        put_task_struct(p);
        return -EPERM;
    }

    /*
     * We need to look at the task state to figure out whether the
     * task is exiting. To protect against the change of the task state
     * in futex_exit_release(), we do this protected by p->pi_lock:
     */
    raw_spin_lock_irq(&p->pi_lock);
    if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
        /*
         * The task is on the way out. When the futex state is
         * FUTEX_STATE_DEAD, we know that the task has finished
         * the cleanup:
         */
        int ret = handle_exit_race(uaddr, uval, p);

        raw_spin_unlock_irq(&p->pi_lock);
        /*
         * If the owner task is between FUTEX_STATE_EXITING and
         * FUTEX_STATE_DEAD then store the task pointer and keep
         * the reference on the task struct. The calling code will
         * drop all locks, wait for the task to reach
         * FUTEX_STATE_DEAD and then drop the refcount. This is
         * required to prevent a live lock when the current task
         * preempted the exiting task between the two states.
         */
        if (ret == -EBUSY)
            *exiting = p;
        else
            put_task_struct(p);
        return ret;
    }

    __attach_to_pi_owner(p, key, ps);
    raw_spin_unlock_irq(&p->pi_lock);

    put_task_struct(p);

    return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
    int err;
    u32 curval;

    if (unlikely(should_fail_futex(true)))
        return -EFAULT;

    err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
    if (unlikely(err))
        return err;

    /* If user space value changed, let the caller retry */
    return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:       the pi futex user address
 * @hb:          the pi futex hash bucket
 * @key:         the futex key associated with uaddr and hb
 * @ps:          the pi_state pointer where we store the result of the
 *               lookup
 * @task:        the task to perform the atomic lock work for.  This will
 *               be "current" except in the case of requeue pi.
 * @exiting:     Pointer to store the task pointer of the owner task
 *               which is in the middle of exiting
 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
             union futex_key *key,
             struct futex_pi_state **ps,
             struct task_struct *task,
             struct task_struct **exiting,
             int set_waiters)
{
    u32 uval, newval, vpid = task_pid_vnr(task);
    struct futex_q *top_waiter;
    int ret;

    /*
     * Read the user space value first so we can validate a few
     * things before proceeding further.
     */
    if (futex_get_value_locked(&uval, uaddr))
        return -EFAULT;

    if (unlikely(should_fail_futex(true)))
        return -EFAULT;

    /*
     * Detect deadlocks.
     */
    if (unlikely((uval & FUTEX_TID_MASK) == vpid))
        return -EDEADLK;

    if (unlikely(should_fail_futex(true)))
        return -EDEADLK;

    /*
     * Lookup existing state first. If it exists, try to attach to
     * its pi_state.
     */
    top_waiter = futex_top_waiter(hb, key);
    if (top_waiter)
        return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

    /*
     * No waiter and user TID is 0. We are here because either the
     * waiters bit or the owner died bit is set, we were called from
     * requeue_cmp_pi, or for whatever other reason something forced
     * the syscall.
     */
    if (!(uval & FUTEX_TID_MASK)) {
        /*
         * We take over the futex. No other waiters and the user space
         * TID is 0. We preserve the owner died bit.
         */
        newval = uval & FUTEX_OWNER_DIED;
        newval |= vpid;

        /* The futex requeue_pi code can enforce the waiters bit */
        if (set_waiters)
            newval |= FUTEX_WAITERS;

        ret = lock_pi_update_atomic(uaddr, uval, newval);
        if (ret)
            return ret;

        /*
         * If the waiter bit was requested the caller also needs PI
         * state attached to the new owner of the user space futex.
         *
         * @task is guaranteed to be alive and it cannot be exiting
         * because it is either sleeping or waiting in
         * futex_requeue_pi_wakeup_sync().
         *
         * No need to do the full attach_to_pi_owner() exercise
         * because @task is known and valid.
         */
        if (set_waiters) {
            raw_spin_lock_irq(&task->pi_lock);
            __attach_to_pi_owner(task, key, ps);
            raw_spin_unlock_irq(&task->pi_lock);
        }
        return 1;
    }

    /*
     * First waiter. Set the waiters bit before attaching ourself to
     * the owner. If owner tries to unlock, it will be forced into
     * the kernel and blocked on hb->lock.
     */
    newval = uval | FUTEX_WAITERS;
    ret = lock_pi_update_atomic(uaddr, uval, newval);
    if (ret)
        return ret;
    /*
     * If the update of the user space value succeeded, we try to
     * attach to the owner. If that fails, no harm done, we only
     * set the FUTEX_WAITERS bit in the user space variable.
     */
    return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
    struct rt_mutex_waiter *top_waiter;
    struct task_struct *new_owner;
    bool postunlock = false;
    DEFINE_RT_WAKE_Q(wqh);
    u32 curval, newval;
    int ret = 0;

    top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
    if (WARN_ON_ONCE(!top_waiter)) {
        /*
         * As per the comment in futex_unlock_pi() this should not happen.
         *
         * When this happens, give up our locks and try again, giving
         * the futex_lock_pi() instance time to complete, either by
         * waiting on the rtmutex or removing itself from the futex
         * queue.
         */
        ret = -EAGAIN;
        goto out_unlock;
    }

    new_owner = top_waiter->task;

    /*
     * We pass it to the next owner. The WAITERS bit is always kept
     * enabled while there is PI state around. We cleanup the owner
     * died bit, because we are the owner.
     */
    newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

    if (unlikely(should_fail_futex(true))) {
        ret = -EFAULT;
        goto out_unlock;
    }

    ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
    if (!ret && (curval != uval)) {
        /*
         * If an unconditional UNLOCK_PI operation (user space did not
         * try the TID->0 transition) raced with a waiter setting the
         * FUTEX_WAITERS flag between get_user() and locking the hash
         * bucket lock, retry the operation.
         */
        if ((FUTEX_TID_MASK & curval) == uval)
            ret = -EAGAIN;
        else
            ret = -EINVAL;
    }

    if (!ret) {
        /*
         * This is a point of no return; once we modified the uval
         * there is no going back and subsequent operations must
         * not fail.
         */
        pi_state_update_owner(pi_state, new_owner);
        postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
    }

out_unlock:
    raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

    if (postunlock)
        rt_mutex_postunlock(&wqh);

    return ret;
}

static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
                  struct task_struct *argowner)
{
    struct futex_pi_state *pi_state = q->pi_state;
    struct task_struct *oldowner, *newowner;
    u32 uval, curval, newval, newtid;
    int err = 0;

    oldowner = pi_state->owner;

    /*
     * We are here because either:
     *
     *  - we stole the lock and pi_state->owner needs updating to reflect
     *    that (@argowner == current),
     *
     * or:
     *
     *  - someone stole our lock and we need to fix things to point to the
     *    new owner (@argowner == NULL).
     *
     * Either way, we have to replace the TID in the user space variable.
     * This must be atomic as we have to preserve the owner died bit here.
     *
     * Note: We write the user space value _before_ changing the pi_state
     * because we can fault here. Imagine swapped out pages or a fork
     * that marked all the anonymous memory readonly for cow.
     *
     * Modifying pi_state _before_ the user space value would leave the
     * pi_state in an inconsistent state when we fault here, because we
     * need to drop the locks to handle the fault. This might be observed
     * in the PID checks when attaching to PI state.
     */
retry:
    if (!argowner) {
        if (oldowner != current) {
            /*
             * We raced against a concurrent self; things are
             * already fixed up. Nothing to do.
             */
            return 0;
        }

        if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
            /* We got the lock. pi_state is correct. Tell caller. */
            return 1;
        }

        /*
         * The trylock just failed, so either there is an owner or
         * there is a higher priority waiter than this one.
         */
        newowner = rt_mutex_owner(&pi_state->pi_mutex);
        /*
         * If the higher priority waiter has not yet taken over the
         * rtmutex then newowner is NULL. We can't return here with
         * that state because it's inconsistent vs. the user space
         * state. So drop the locks and try again. It's a valid
         * situation and not any different from the other retry
         * conditions.
         */
        if (unlikely(!newowner)) {
            err = -EAGAIN;
            goto handle_err;
        }
    } else {
        WARN_ON_ONCE(argowner != current);
        if (oldowner == current) {
            /*
             * We raced against a concurrent self; things are
             * already fixed up. Nothing to do.
             */
            return 1;
        }
        newowner = argowner;
    }

    newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
    /* Owner died? */
    if (!pi_state->owner)
        newtid |= FUTEX_OWNER_DIED;

    err = futex_get_value_locked(&uval, uaddr);
    if (err)
        goto handle_err;

    for (;;) {
        newval = (uval & FUTEX_OWNER_DIED) | newtid;

        err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
        if (err)
            goto handle_err;

        if (curval == uval)
            break;
        uval = curval;
    }

    /*
     * We fixed up user space. Now we need to fix the pi_state
     * itself.
     */
    pi_state_update_owner(pi_state, newowner);

    return argowner == current;

    /*
     * In order to reschedule or handle a page fault, we need to drop the
     * locks here. In the case of a fault, this gives the other task
     * (either the highest priority waiter itself or the task which stole
     * the rtmutex) the chance to try the fixup of the pi_state. So once we
     * are back from handling the fault we need to check the pi_state after
     * reacquiring the locks and before trying to do another fixup. When
     * the fixup has been done already we simply return.
     *
     * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
     * drop hb->lock since the caller owns the hb -> futex_q relation.
     * Dropping the pi_mutex->wait_lock requires the state revalidate.
     */
handle_err:
    raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    spin_unlock(q->lock_ptr);

    switch (err) {
    case -EFAULT:
        err = fault_in_user_writeable(uaddr);
        break;

    case -EAGAIN:
        cond_resched();
        err = 0;
        break;

    default:
        WARN_ON_ONCE(1);
        break;
    }

    spin_lock(q->lock_ptr);
    raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

    /*
     * Check if someone else fixed it for us:
     */
    if (pi_state->owner != oldowner)
        return argowner == current;

    /* Retry if err was -EAGAIN or the fault-in succeeded */
    if (!err)
        goto retry;

    /*
     * fault_in_user_writeable() failed so user state is immutable. At
     * best we can make the kernel state consistent but user state will
     * be most likely hosed and any subsequent unlock operation will be
     * rejected due to PI futex rule [10].
     *
     * Ensure that the rtmutex owner is also the pi_state owner despite
     * the user space value claiming something different. There is no
     * point in unlocking the rtmutex if current is the owner as it
     * would need to wait until the next waiter has taken the rtmutex
     * to guarantee consistent state. Keep it simple. Userspace asked
     * for this wrecked state.
     *
     * The rtmutex has an owner - either current or some other
     * task. See the EAGAIN loop above.
     */
    pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

    return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
                struct task_struct *argowner)
{
    struct futex_pi_state *pi_state = q->pi_state;
    int ret;

    lockdep_assert_held(q->lock_ptr);

    raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    ret = __fixup_pi_state_owner(uaddr, q, argowner);
    raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:  user address of the futex
 * @q:      futex_q (contains pi_state and access to the rt_mutex)
 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
    if (locked) {
        /*
         * Got the lock. We might not be the anticipated owner if we
         * did a lock-steal - fix up the PI-state in that case:
         *
         * Speculative pi_state->owner read (we don't hold wait_lock);
         * since we own the lock pi_state->owner == current is the
         * stable state, anything else needs more attention.
         */
        if (q->pi_state->owner != current)
            return fixup_pi_state_owner(uaddr, q, current);
        return 1;
    }

    /*
     * If we didn't get the lock; check if anybody stole it from us. In
     * that case, we need to fix up the uval to point to them instead of
     * us, otherwise bad things happen. [10]
     *
     * Another speculative read; pi_state->owner == current is unstable
     * but needs our attention.
     */
    if (q->pi_state->owner == current)
        return fixup_pi_state_owner(uaddr, q, NULL);

    /*
     * Paranoia check. If we did not take the lock, then we should not be
     * the owner of the rt_mutex. Warn and establish consistent state.
     */
    if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
        return fixup_pi_state_owner(uaddr, q, current);

    return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * This also serves as the futex trylock_pi() slow path, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
    struct hrtimer_sleeper timeout, *to;
    struct task_struct *exiting = NULL;
    struct rt_mutex_waiter rt_waiter;
    struct futex_hash_bucket *hb;
    struct futex_q q = futex_q_init;
    int res, ret;

    if (!IS_ENABLED(CONFIG_FUTEX_PI))
        return -ENOSYS;

    if (refill_pi_state_cache())
        return -ENOMEM;

    to = futex_setup_timer(time, &timeout, flags, 0);

retry:
    ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
    if (unlikely(ret != 0))
        goto out;

retry_private:
    hb = futex_q_lock(&q);

    ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
                   &exiting, 0);
    if (unlikely(ret)) {
        /*
         * Atomic work succeeded and we got the lock,
         * or failed. Either way, we do _not_ block.
         */
        switch (ret) {
        case 1:
            /* We got the lock. */
            ret = 0;
            goto out_unlock_put_key;
        case -EFAULT:
            goto uaddr_faulted;
        case -EBUSY:
        case -EAGAIN:
            /*
             * Two reasons for this:
             * - EBUSY: Task is exiting and we just wait for the
             *   exit to complete.
             * - EAGAIN: The user space value changed.
             */
            futex_q_unlock(hb);
            /*
             * Handle the case where the owner is in the middle of
             * exiting. Wait for the exit to complete otherwise
             * this task might loop forever, aka. live lock.
             */
            wait_for_owner_exiting(ret, exiting);
            cond_resched();
            goto retry;
        default:
            goto out_unlock_put_key;
        }
    }

    WARN_ON(!q.pi_state);

    /*
     * Only actually queue now that the atomic ops are done:
     */
    __futex_queue(&q, hb);

    if (trylock) {
        ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
        /* Fixup the trylock return value: */
        ret = ret ? 0 : -EWOULDBLOCK;
        goto no_block;
    }

    rt_mutex_init_waiter(&rt_waiter);

    /*
     * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
     * hold it while doing __rt_mutex_start_proxy_lock(), because then
     * it will include hb->lock in the blocking chain, even though we'll
     * not in fact hold it while blocking. This will lead it to report
     * -EDEADLK and BUG when futex_unlock_pi() interleaves with this.
     *
     * Therefore acquire wait_lock while holding hb->lock, but drop the
     * latter before calling __rt_mutex_start_proxy_lock(). This
     * interleaves with futex_unlock_pi() -- which does a similar lock
     * handoff -- such that the latter can observe the futex_q::pi_state
     * before __rt_mutex_start_proxy_lock() is done.
     */
    raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
    spin_unlock(q.lock_ptr);
    /*
     * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
     * such that futex_unlock_pi() is guaranteed to observe the waiter when
     * it sees the futex_q::pi_state.
     */
    ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
    raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

    if (ret) {
        if (ret == 1)
            ret = 0;
        goto cleanup;
    }

    if (unlikely(to))
        hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

    ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
    spin_lock(q.lock_ptr);
    /*
     * If we failed to acquire the lock (deadlock/signal/timeout), we must
     * first acquire the hb->lock before removing the lock from the
     * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
     * lists consistent.
     *
     * In particular; it is important that futex_unlock_pi() can not
     * observe this inconsistency.
     */
    if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
        ret = 0;

no_block:
    /*
     * Fixup the pi_state owner and possibly acquire the lock if we
     * haven't already.
     */
    res = fixup_pi_owner(uaddr, &q, !ret);
    /*
     * If fixup_pi_owner() returned an error, propagate that.  If it acquired
     * the lock, clear our -ETIMEDOUT or -EINTR.
     */
    if (res)
        ret = (res < 0) ? res : 0;

    futex_unqueue_pi(&q);
    spin_unlock(q.lock_ptr);
    goto out;

out_unlock_put_key:
    futex_q_unlock(hb);

out:
    if (to) {
        hrtimer_cancel(&to->timer);
        destroy_hrtimer_on_stack(&to->timer);
    }
    return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
    futex_q_unlock(hb);

    ret = fault_in_user_writeable(uaddr);
    if (ret)
        goto out;

    if (!(flags & FLAGS_SHARED))
        goto retry_private;

    goto retry;
}
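
/*
 * Illustrative sketch (not from this file): the user space side of the
 * FUTEX_LOCK_PI protocol that leads into futex_lock_pi(). The fast path
 * attempts the 0 -> TID transition with a compare-and-exchange and only
 * enters the kernel when that fails:
 *
 *      uint32_t zero = 0;
 *      uint32_t tid = gettid();
 *
 *      if (!atomic_compare_exchange_strong(uaddr, &zero, tid))
 *              syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 */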

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
    u32 curval, uval, vpid = task_pid_vnr(current);
    union futex_key key = FUTEX_KEY_INIT;
    struct futex_hash_bucket *hb;
    struct futex_q *top_waiter;
    int ret;

    if (!IS_ENABLED(CONFIG_FUTEX_PI))
        return -ENOSYS;

retry:
    if (get_user(uval, uaddr))
        return -EFAULT;
    /*
     * We release only a lock we actually own:
     */
    if ((uval & FUTEX_TID_MASK) != vpid)
        return -EPERM;

    ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
    if (ret)
        return ret;

    hb = futex_hash(&key);
    spin_lock(&hb->lock);

    /*
     * Check waiters first. We do not trust user space values at
     * all and we at least want to know if user space fiddled
     * with the futex value instead of blindly unlocking.
     */
    top_waiter = futex_top_waiter(hb, &key);
    if (top_waiter) {
        struct futex_pi_state *pi_state = top_waiter->pi_state;

        ret = -EINVAL;
        if (!pi_state)
            goto out_unlock;

        /*
         * If current does not own the pi_state then the futex is
         * inconsistent and user space fiddled with the futex value.
         */
        if (pi_state->owner != current)
            goto out_unlock;

        get_pi_state(pi_state);
        /*
         * By taking wait_lock while still holding hb->lock, we ensure
         * there is no point where we hold neither; and therefore
         * wake_futex_pi() must observe a state consistent with what we
         * observed.
         *
         * In particular; this forces __rt_mutex_start_proxy_lock() to
         * complete such that we're guaranteed to observe the
         * rt_waiter. Also see the WARN in wake_futex_pi().
         */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(&hb->lock);

        /* drops pi_state->pi_mutex.wait_lock */
        ret = wake_futex_pi(uaddr, uval, pi_state);

        put_pi_state(pi_state);

        /*
         * Success, we're done! No tricky corner cases.
         */
        if (!ret)
            return ret;
        /*
         * The atomic access to the futex value generated a
         * pagefault, so retry the user-access and the wakeup:
         */
        if (ret == -EFAULT)
            goto pi_faulted;
        /*
         * An unconditional UNLOCK_PI op raced against a waiter
         * setting the FUTEX_WAITERS bit. Try again.
         */
        if (ret == -EAGAIN)
            goto pi_retry;
        /*
         * wake_futex_pi has detected invalid state. Tell user
         * space.
         */
        return ret;
    }

    /*
     * We have no kernel internal state, i.e. no waiters in the
     * kernel. Waiters which are about to queue themselves are stuck
     * on hb->lock. So we can safely ignore them. We do neither
     * preserve the WAITERS bit nor the OWNER_DIED one. We are the
     * owner.
     */
    if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
        spin_unlock(&hb->lock);
        switch (ret) {
        case -EFAULT:
            goto pi_faulted;

        case -EAGAIN:
            goto pi_retry;

        default:
            WARN_ON_ONCE(1);
            return ret;
        }
    }

    /*
     * If uval has changed, let user space handle it.
     */
    ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
    spin_unlock(&hb->lock);
    return ret;

pi_retry:
    cond_resched();
    goto retry;

pi_faulted:

    ret = fault_in_user_writeable(uaddr);
    if (!ret)
        goto retry;

    return ret;
}
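
/*
 * Illustrative sketch (not from this file): the user space side of the
 * FUTEX_UNLOCK_PI protocol. Unlock tries the TID -> 0 transition first and
 * only enters this slow path when it fails, i.e. when FUTEX_WAITERS (or
 * FUTEX_OWNER_DIED) is set in the futex word:
 *
 *      uint32_t tid = gettid();
 *
 *      if (!atomic_compare_exchange_strong(uaddr, &tid, 0))
 *              syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */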