// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}
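
/*
 * Illustrative sketch (not part of the original file): the per-task cache
 * above exists so that the GFP_KERNEL allocation happens before any locks
 * are taken. A typical lifetime for a single futex_lock_pi() call, assuming
 * the cache slot is free, is:
 *
 *	refill_pi_state_cache();	// may sleep; fills current->pi_state_cache
 *	pi_state = alloc_pi_state();	// lockless handoff out of the cache
 *	...				// pi_mutex init, owner attach, refcounting
 *	put_pi_state(pi_state);		// last ref: reset and park back in cache
 */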

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       |  0        | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      |  0        | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      |  0        | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      |  0        | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non-PI futex.
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list().
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match.
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4].
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain an arbitrary number
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
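
/*
 * Illustrative aside (not part of the original file): the uTID/uODIED
 * columns above refer to the single 32-bit user space futex word, which
 * packs (values from <linux/futex.h>):
 *
 *	FUTEX_WAITERS		0x80000000	bit 31: waiters exist
 *	FUTEX_OWNER_DIED	0x40000000	bit 30: the owner died
 *	FUTEX_TID_MASK		0x3fffffff	bits 0-29: owner TID, 0 == free
 *
 * e.g. 0xC0000000 is "owner died, waiters pending, no owner TID" - the
 * value exit_robust_list() leaves behind in the handle_exit_race()
 * diagram below.
 */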

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}
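
/*
 * Worked example (not part of the original file), tracing the checks in
 * attach_to_pi_state() with FUTEX_OWNER_DIED set in uval:
 *
 *	uval == 0xC0000000, pi_state->owner == NULL
 *		-> pid == 0: attach				[4]
 *	uval == 0xC0000123, pi_state->owner == NULL
 *		-> pid != 0 but no owner: -EINVAL		[5]
 *	uval == 0xC0000000, pi_state->owner == task
 *		-> pid == 0, owner not yet fixed up: attach	[6]
 */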

static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *				  futex_lock_pi_atomic()
	 *   exit_signals(tsk)		   No waiters:
	 *    tsk->flags |= PF_EXITING;	   *uaddr == 0x00000PID
	 *  mm_release(tsk)		   Set waiter bit
	 *   exit_robust_list(tsk) {	   *uaddr = 0x80000PID;
	 *      Set owner died		   attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	    tsk = get_task(PID);
	 *   }				    if (!tsk->flags & PF_EXITING) {
	 *   ...			      attach();
	 *   tsk->futex_state =		    } else {
	 *	FUTEX_STATE_DEAD;	      if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *					return -EAGAIN;
	 *				      return -ESRCH; <--- FAIL
	 *				    }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}
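
/*
 * Illustrative sketch (not part of this file): the user space fast path
 * that this kernel slow path backs up. Per futex(2), a PI lock attempt
 * first tries the 0 -> TID transition with an atomic cmpxchg and only
 * enters the kernel via FUTEX_LOCK_PI when that fails. Assumes
 * <linux/futex.h>, <sys/syscall.h>, <unistd.h> and <stdint.h>:
 *
 *	static void pi_lock(uint32_t *f)
 *	{
 *		uint32_t zero = 0;
 *		uint32_t tid = (uint32_t)syscall(SYS_gettid);
 *
 *		if (__atomic_compare_exchange_n(f, &zero, tid, false,
 *						__ATOMIC_ACQUIRE,
 *						__ATOMIC_RELAXED))
 *			return;		// uncontended: 0 -> TID succeeded
 *		syscall(SYS_futex, f, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *	}
 */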

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if (unlikely((uval & FUTEX_TID_MASK) == vpid))
		return -EDEADLK;

	if (unlikely(should_fail_futex(true)))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourselves to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
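
/*
 * Worked example (not part of the original file) of the exits above, for
 * a caller with vpid 0x123 and no top waiter found on the hash bucket:
 *
 *	uval == 0x00000000		-> take over: newval = 0x00000123,
 *					   return 1
 *	uval == 0x40000000 (OWNER_DIED)	-> take over, bit preserved:
 *					   newval = 0x40000123, return 1
 *	uval == 0x00000123 (own TID)	-> -EDEADLK
 *	uval == 0x00000456 (owned)	-> set FUTEX_WAITERS (0x80000456),
 *					   attach_to_pi_owner(), return 0
 */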

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */
		ret = -EAGAIN;
		goto out_unlock;
	}

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We clean up the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}
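
/*
 * Illustrative trace (not part of the original file): with uval ==
 * 0x80000123 (FUTEX_WAITERS set, owner TID 0x123 == current) and a top
 * waiter with TID 0x456, the cmpxchg above rewrites the futex word to
 * 0x80000456, and rt_mutex ownership is handed off to that waiter before
 * it is woken.
 */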

static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires revalidating the state.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault-in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to clean up
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() implementation, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete, otherwise
			 * this task might loop forever, a.k.a. livelock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even though we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	spin_lock(q.lock_ptr);
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * first acquire the hb->lock before removing the lock from the
	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
	 * lists consistent.
	 *
	 * In particular; it is important that futex_unlock_pi() can not
	 * observe this inconsistency.
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		get_pi_state(pi_state);
		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and therefore
		 * wake_futex_pi() must observe a state consistent with what we
		 * observed.
		 *
		 * In particular; this forces __rt_mutex_start_proxy_lock() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter. Also see the WARN in wake_futex_pi().
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi() has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We neither preserve
	 * the WAITERS bit nor the OWNER_DIED one. We are the owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:
	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}
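
/*
 * Illustrative sketch (not part of this file): the user space unlock fast
 * path that futex_unlock_pi() above backs up, per futex(2). Only the plain
 * TID -> 0 transition may be done in user space; if FUTEX_WAITERS (or
 * FUTEX_OWNER_DIED) is set, the kernel must perform the handoff. Assumes
 * <linux/futex.h>, <sys/syscall.h>, <unistd.h> and <stdint.h>:
 *
 *	static void pi_unlock(uint32_t *f)
 *	{
 *		uint32_t tid = (uint32_t)syscall(SYS_gettid);
 *
 *		if (__atomic_compare_exchange_n(f, &tid, 0, false,
 *						__ATOMIC_RELEASE,
 *						__ATOMIC_RELAXED))
 *			return;		// no waiters: TID -> 0 succeeded
 *		syscall(SYS_futex, f, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *	}
 */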