/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/completion.h>

#include <drm/drm_print.h>
#include <drm/gpu_scheduler.h>

#include "gpu_scheduler_trace.h"

#define to_drm_sched_job(sched_job)     \
        container_of((sched_job), struct drm_sched_job, queue_node)

/**
 * drm_sched_entity_init - Init a context entity used by the scheduler when
 * submitting jobs to a HW ring.
 *
 * @entity: scheduler entity to init
 * @priority: priority of the entity
 * @sched_list: the list of drm scheds on which jobs from this
 *           entity can be submitted
 * @num_sched_list: number of drm sched in sched_list
 * @guilty: atomic_t set to 1 when a job on this queue
 *          is found to be guilty causing a timeout
 *
 * Note that the &sched_list must have at least one element to schedule the entity.
 *
 * For changing @priority later on at runtime see
 * drm_sched_entity_set_priority(). For changing the set of schedulers
 * @sched_list at runtime see drm_sched_entity_modify_sched().
 *
 * An entity is cleaned up by calling drm_sched_entity_fini(). See also
 * drm_sched_entity_destroy().
 *
 * Returns 0 on success or a negative error code on failure.
 */
int drm_sched_entity_init(struct drm_sched_entity *entity,
              enum drm_sched_priority priority,
              struct drm_gpu_scheduler **sched_list,
              unsigned int num_sched_list,
              atomic_t *guilty)
{
    if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])))
        return -EINVAL;

    memset(entity, 0, sizeof(struct drm_sched_entity));
    INIT_LIST_HEAD(&entity->list);
    entity->rq = NULL;
    entity->guilty = guilty;
    entity->num_sched_list = num_sched_list;
    entity->priority = priority;
    entity->sched_list = num_sched_list > 1 ? sched_list : NULL;
    entity->last_scheduled = NULL;

    if (num_sched_list)
        entity->rq = &sched_list[0]->sched_rq[entity->priority];

    init_completion(&entity->entity_idle);

    /* We start in an idle state. */
    complete(&entity->entity_idle);

    spin_lock_init(&entity->rq_lock);
    spsc_queue_init(&entity->job_queue);

    atomic_set(&entity->fence_seq, 0);
    entity->fence_context = dma_fence_context_alloc(2);

    return 0;
}
EXPORT_SYMBOL(drm_sched_entity_init);
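
/*
 * Example: a minimal usage sketch. The surrounding structures (my_gpu with
 * a single scheduler instance, and my_ctx wrapping the entity) are
 * hypothetical driver state, not part of this API; a real driver would
 * embed the entity in its own context object.
 *
 *    struct drm_gpu_scheduler *sched_list[] = { &my_gpu->sched };
 *    int ret;
 *
 *    ret = drm_sched_entity_init(&my_ctx->entity,
 *                                DRM_SCHED_PRIORITY_NORMAL,
 *                                sched_list, ARRAY_SIZE(sched_list),
 *                                NULL);
 *    if (ret)
 *        return ret;
 */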

/**
 * drm_sched_entity_modify_sched - Modify sched of an entity
 * @entity: scheduler entity to modify
 * @sched_list: the list of new drm scheds which will replace
 *       existing entity->sched_list
 * @num_sched_list: number of drm sched in sched_list
 *
 * Note that this must be called under the same common lock for @entity as
 * drm_sched_job_arm() and drm_sched_entity_push_job(), or the driver needs to
 * guarantee through some other means that this is never called while new jobs
 * can be pushed to @entity.
 */
void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
                    struct drm_gpu_scheduler **sched_list,
                    unsigned int num_sched_list)
{
    WARN_ON(!num_sched_list || !sched_list);

    entity->sched_list = sched_list;
    entity->num_sched_list = num_sched_list;
}
EXPORT_SYMBOL(drm_sched_entity_modify_sched);
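
/*
 * Example: a hypothetical driver moving an entity to a different set of
 * engines. The names my_ctx and new_sched_list are assumptions for this
 * sketch; per the note above, the driver must hold the same lock it uses
 * around drm_sched_job_arm()/drm_sched_entity_push_job() (here my_ctx->lock).
 *
 *    mutex_lock(&my_ctx->lock);
 *    drm_sched_entity_modify_sched(&my_ctx->entity, new_sched_list,
 *                                  ARRAY_SIZE(new_sched_list));
 *    mutex_unlock(&my_ctx->lock);
 */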

static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
{
    rmb(); /* for list_empty to work without lock */

    if (list_empty(&entity->list) ||
        spsc_queue_count(&entity->job_queue) == 0 ||
        entity->stopped)
        return true;

    return false;
}

/* Return true if entity could provide a job. */
bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
{
    if (spsc_queue_peek(&entity->job_queue) == NULL)
        return false;

    if (READ_ONCE(entity->dependency))
        return false;

    return true;
}

/**
 * drm_sched_entity_flush - Flush a context entity
 *
 * @entity: scheduler entity
 * @timeout: time to wait, in jiffies, for the queue to become empty
 *
 * drm_sched_entity_fini() is split into two functions. This first one does
 * the waiting, removes the entity from the runqueue and returns an error
 * when the process was killed.
 *
 * Returns the remaining time in jiffies left from the input timeout
 */
long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
{
    struct drm_gpu_scheduler *sched;
    struct task_struct *last_user;
    long ret = timeout;

    if (!entity->rq)
        return 0;

    sched = entity->rq->sched;
    /*
     * The client will not queue more IBs during this fini; consume the
     * existing queued IBs or discard them on SIGKILL.
     */
    if (current->flags & PF_EXITING) {
        if (timeout)
            ret = wait_event_timeout(
                    sched->job_scheduled,
                    drm_sched_entity_is_idle(entity),
                    timeout);
    } else {
        wait_event_killable(sched->job_scheduled,
                    drm_sched_entity_is_idle(entity));
    }

    /* For a killed process, disable any further IB enqueue right now */
    last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
    if ((!last_user || last_user == current->group_leader) &&
        (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
        spin_lock(&entity->rq_lock);
        entity->stopped = true;
        drm_sched_rq_remove_entity(entity->rq, entity);
        spin_unlock(&entity->rq_lock);
    }

    return ret;
}
EXPORT_SYMBOL(drm_sched_entity_flush);
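
/*
 * Example: flushing an entity from a hypothetical file-release path before
 * final teardown. my_ctx is assumed driver state and the one second timeout
 * is an arbitrary choice for this sketch; drivers typically rely on
 * drm_sched_entity_destroy() with MAX_WAIT_SCHED_ENTITY_Q_EMPTY instead.
 *
 *    long timeout;
 *
 *    timeout = drm_sched_entity_flush(&my_ctx->entity,
 *                                     msecs_to_jiffies(1000));
 *    if (!timeout)
 *        DRM_WARN("entity queue did not drain in time\n");
 */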

static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk)
{
    struct drm_sched_job *job = container_of(wrk, typeof(*job), work);

    drm_sched_fence_finished(job->s_fence);
    WARN_ON(job->s_fence->parent);
    job->sched->ops->free_job(job);
}

/* Signal the scheduler finished fence when the entity in question is killed. */
static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
                      struct dma_fence_cb *cb)
{
    struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
                         finish_cb);

    INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work);
    schedule_work(&job->work);
}

static struct dma_fence *
drm_sched_job_dependency(struct drm_sched_job *job,
             struct drm_sched_entity *entity)
{
    if (!xa_empty(&job->dependencies))
        return xa_erase(&job->dependencies, job->last_dependency++);

    if (job->sched->ops->dependency)
        return job->sched->ops->dependency(job, entity);

    return NULL;
}

static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
{
    struct drm_sched_job *job;
    struct dma_fence *f;
    int r;

    while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
        struct drm_sched_fence *s_fence = job->s_fence;

        /* Wait for all dependencies to avoid data corruptions */
        while ((f = drm_sched_job_dependency(job, entity)))
            dma_fence_wait(f, false);

        drm_sched_fence_scheduled(s_fence);
        dma_fence_set_error(&s_fence->finished, -ESRCH);

        /*
         * When the pipe is hung by an older entity, the new entity might
         * not even have had a chance to submit its first job to the HW,
         * so entity->last_scheduled will remain NULL.
         */
        if (!entity->last_scheduled) {
            drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
            continue;
        }

        r = dma_fence_add_callback(entity->last_scheduled,
                       &job->finish_cb,
                       drm_sched_entity_kill_jobs_cb);
        if (r == -ENOENT)
            drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
        else if (r)
            DRM_ERROR("fence add callback failed (%d)\n", r);
    }
}

/**
 * drm_sched_entity_fini - Destroy a context entity
 *
 * @entity: scheduler entity
 *
 * Cleans up @entity which has been initialized by drm_sched_entity_init().
 *
 * If there are potentially jobs still in flight or getting newly queued,
 * drm_sched_entity_flush() must be called first. This function then goes over
 * the entity and signals all jobs with an error code if the process was killed.
 */
void drm_sched_entity_fini(struct drm_sched_entity *entity)
{
    struct drm_gpu_scheduler *sched = NULL;

    if (entity->rq) {
        sched = entity->rq->sched;
        drm_sched_rq_remove_entity(entity->rq, entity);
    }

    /* Consumption of existing IBs wasn't completed. Forcefully
     * remove them here.
     */
    if (spsc_queue_count(&entity->job_queue)) {
        if (sched) {
            /*
             * Wait for thread to idle to make sure it isn't processing
             * this entity.
             */
            wait_for_completion(&entity->entity_idle);
        }
        if (entity->dependency) {
            dma_fence_remove_callback(entity->dependency,
                          &entity->cb);
            dma_fence_put(entity->dependency);
            entity->dependency = NULL;
        }

        drm_sched_entity_kill_jobs(entity);
    }

    dma_fence_put(entity->last_scheduled);
    entity->last_scheduled = NULL;
}
EXPORT_SYMBOL(drm_sched_entity_fini);
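
/*
 * Example: the two-step teardown described above, written out for a
 * hypothetical my_ctx. This is equivalent to drm_sched_entity_destroy()
 * when the default timeout is acceptable.
 *
 *    drm_sched_entity_flush(&my_ctx->entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY);
 *    drm_sched_entity_fini(&my_ctx->entity);
 */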

/**
 * drm_sched_entity_destroy - Destroy a context entity
 * @entity: scheduler entity
 *
 * Calls drm_sched_entity_flush() and drm_sched_entity_fini() as a
 * convenience wrapper.
 */
void drm_sched_entity_destroy(struct drm_sched_entity *entity)
{
    drm_sched_entity_flush(entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY);
    drm_sched_entity_fini(entity);
}
EXPORT_SYMBOL(drm_sched_entity_destroy);
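
/*
 * Example: typical use from a hypothetical context-release callback,
 * where my_ctx wraps the entity and is freed afterwards.
 *
 *    drm_sched_entity_destroy(&my_ctx->entity);
 *    kfree(my_ctx);
 */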

/* drm_sched_entity_clear_dep - callback to clear the entity's dependency */
static void drm_sched_entity_clear_dep(struct dma_fence *f,
                       struct dma_fence_cb *cb)
{
    struct drm_sched_entity *entity =
        container_of(cb, struct drm_sched_entity, cb);

    entity->dependency = NULL;
    dma_fence_put(f);
}

/*
 * drm_sched_entity_wakeup - callback to clear the entity's dependency and
 * wake up the scheduler
 */
static void drm_sched_entity_wakeup(struct dma_fence *f,
                    struct dma_fence_cb *cb)
{
    struct drm_sched_entity *entity =
        container_of(cb, struct drm_sched_entity, cb);

    drm_sched_entity_clear_dep(f, cb);
    drm_sched_wakeup(entity->rq->sched);
}

/**
 * drm_sched_entity_set_priority - Sets priority of the entity
 *
 * @entity: scheduler entity
 * @priority: scheduler priority
 *
 * Update the priority of runqueues used for the entity.
 */
void drm_sched_entity_set_priority(struct drm_sched_entity *entity,
                   enum drm_sched_priority priority)
{
    spin_lock(&entity->rq_lock);
    entity->priority = priority;
    spin_unlock(&entity->rq_lock);
}
EXPORT_SYMBOL(drm_sched_entity_set_priority);
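
/*
 * Example: raising an entity's priority at runtime, e.g. from a
 * hypothetical context-priority ioctl. The new priority takes effect
 * the next time a run queue is selected for the entity, see
 * drm_sched_entity_select_rq().
 *
 *    drm_sched_entity_set_priority(&my_ctx->entity,
 *                                  DRM_SCHED_PRIORITY_HIGH);
 */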

/*
 * Add a callback to the current dependency of the entity to wake up the
 * scheduler when the entity becomes available.
 */
static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
{
    struct drm_gpu_scheduler *sched = entity->rq->sched;
    struct dma_fence *fence = entity->dependency;
    struct drm_sched_fence *s_fence;

    if (fence->context == entity->fence_context ||
        fence->context == entity->fence_context + 1) {
        /*
         * Fence is a scheduled/finished fence from a job
         * which belongs to the same entity, we can ignore
         * fences from ourselves.
         */
        dma_fence_put(entity->dependency);
        return false;
    }

    s_fence = to_drm_sched_fence(fence);
    if (s_fence && s_fence->sched == sched) {
        /*
         * Fence is from the same scheduler, only need to wait for
         * it to be scheduled.
         */
        fence = dma_fence_get(&s_fence->scheduled);
        dma_fence_put(entity->dependency);
        entity->dependency = fence;
        if (!dma_fence_add_callback(fence, &entity->cb,
                        drm_sched_entity_clear_dep))
            return true;

        /* Ignore it when it is already scheduled */
        dma_fence_put(fence);
        return false;
    }

    if (!dma_fence_add_callback(entity->dependency, &entity->cb,
                    drm_sched_entity_wakeup))
        return true;

    dma_fence_put(entity->dependency);
    return false;
}

struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
{
    struct drm_sched_job *sched_job;

    sched_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
    if (!sched_job)
        return NULL;

    while ((entity->dependency =
            drm_sched_job_dependency(sched_job, entity))) {
        trace_drm_sched_job_wait_dep(sched_job, entity->dependency);

        if (drm_sched_entity_add_dependency_cb(entity))
            return NULL;
    }

    /* skip jobs from an entity marked guilty */
    if (entity->guilty && atomic_read(entity->guilty))
        dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);

    dma_fence_put(entity->last_scheduled);

    entity->last_scheduled = dma_fence_get(&sched_job->s_fence->finished);

    /*
     * If the queue is empty we allow drm_sched_entity_select_rq() to
     * locklessly access ->last_scheduled. This only works if we set the
     * pointer before we dequeue and if we add a write barrier here.
     */
    smp_wmb();

    spsc_queue_pop(&entity->job_queue);
    return sched_job;
}

void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
{
    struct dma_fence *fence;
    struct drm_gpu_scheduler *sched;
    struct drm_sched_rq *rq;

    /* single possible engine and already selected */
    if (!entity->sched_list)
        return;

    /* queue non-empty, stay on the same engine */
    if (spsc_queue_count(&entity->job_queue))
        return;

    /*
     * Only when the queue is empty are we guaranteed that the scheduler
     * thread cannot change ->last_scheduled. To enforce ordering we need
     * a read barrier here. See drm_sched_entity_pop_job() for the other
     * side.
     */
    smp_rmb();

    fence = entity->last_scheduled;

    /* stay on the same engine if the previous job hasn't finished */
    if (fence && !dma_fence_is_signaled(fence))
        return;

    spin_lock(&entity->rq_lock);
    sched = drm_sched_pick_best(entity->sched_list, entity->num_sched_list);
    rq = sched ? &sched->sched_rq[entity->priority] : NULL;
    if (rq != entity->rq) {
        drm_sched_rq_remove_entity(entity->rq, entity);
        entity->rq = rq;
    }
    spin_unlock(&entity->rq_lock);

    if (entity->num_sched_list == 1)
        entity->sched_list = NULL;
}

/**
 * drm_sched_entity_push_job - Submit a job to the entity's job queue
 * @sched_job: job to submit
 *
 * Note: To guarantee that the order of insertion to the queue matches the
 * job's fence sequence number, this function should be called together with
 * drm_sched_job_arm() under a common lock for the struct drm_sched_entity
 * that was set up for @sched_job in drm_sched_job_init().
 */
void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
{
    struct drm_sched_entity *entity = sched_job->entity;
    bool first;

    trace_drm_sched_job(sched_job, entity);
    atomic_inc(entity->rq->sched->score);
    WRITE_ONCE(entity->last_user, current->group_leader);
    first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);

    /* first job wakes up scheduler */
    if (first) {
        /* Add the entity to the run queue */
        spin_lock(&entity->rq_lock);
        if (entity->stopped) {
            spin_unlock(&entity->rq_lock);

            DRM_ERROR("Trying to push to a killed entity\n");
            return;
        }
        drm_sched_rq_add_entity(entity->rq, entity);
        spin_unlock(&entity->rq_lock);
        drm_sched_wakeup(entity->rq->sched);
    }
}
EXPORT_SYMBOL(drm_sched_entity_push_job);
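
/*
 * Example: the submission sequence the note above describes, sketched for
 * a hypothetical driver job that embeds a struct drm_sched_job as "base".
 * my_job, my_ctx and my_ctx->lock are assumptions; the lock pairs
 * drm_sched_job_arm() with drm_sched_entity_push_job() so that queue order
 * matches the fence sequence numbers.
 *
 *    ret = drm_sched_job_init(&my_job->base, &my_ctx->entity, my_ctx);
 *    if (ret)
 *        return ret;
 *
 *    mutex_lock(&my_ctx->lock);
 *    drm_sched_job_arm(&my_job->base);
 *    drm_sched_entity_push_job(&my_job->base);
 *    mutex_unlock(&my_ctx->lock);
 */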