// SPDX-License-Identifier: GPL-2.0 OR MIT
/* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */

#include <linux/iosys-map.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/pm_runtime.h>

#include "lima_devfreq.h"
#include "lima_drv.h"
#include "lima_sched.h"
#include "lima_vm.h"
#include "lima_mmu.h"
#include "lima_l2_cache.h"
#include "lima_gem.h"
#include "lima_trace.h"

struct lima_fence {
    struct dma_fence base;
    struct lima_sched_pipe *pipe;
};

static struct kmem_cache *lima_fence_slab;
static int lima_fence_slab_refcnt;

int lima_sched_slab_init(void)
{
    if (!lima_fence_slab) {
        lima_fence_slab = kmem_cache_create(
            "lima_fence", sizeof(struct lima_fence), 0,
            SLAB_HWCACHE_ALIGN, NULL);
        if (!lima_fence_slab)
            return -ENOMEM;
    }

    lima_fence_slab_refcnt++;
    return 0;
}

void lima_sched_slab_fini(void)
{
    if (!--lima_fence_slab_refcnt) {
        kmem_cache_destroy(lima_fence_slab);
        lima_fence_slab = NULL;
    }
}

static inline struct lima_fence *to_lima_fence(struct dma_fence *fence)
{
    return container_of(fence, struct lima_fence, base);
}

static const char *lima_fence_get_driver_name(struct dma_fence *fence)
{
    return "lima";
}

static const char *lima_fence_get_timeline_name(struct dma_fence *fence)
{
    struct lima_fence *f = to_lima_fence(fence);

    return f->pipe->base.name;
}

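/*
 * A dma_fence may still be looked up under RCU (e.g. via
 * dma_fence_get_rcu()), so the final free is deferred through call_rcu()
 * rather than returning the object to the slab immediately.
 */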
static void lima_fence_release_rcu(struct rcu_head *rcu)
{
    struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
    struct lima_fence *fence = to_lima_fence(f);

    kmem_cache_free(lima_fence_slab, fence);
}

static void lima_fence_release(struct dma_fence *fence)
{
    struct lima_fence *f = to_lima_fence(fence);

    call_rcu(&f->base.rcu, lima_fence_release_rcu);
}

static const struct dma_fence_ops lima_fence_ops = {
    .get_driver_name = lima_fence_get_driver_name,
    .get_timeline_name = lima_fence_get_timeline_name,
    .release = lima_fence_release,
};

static struct lima_fence *lima_fence_create(struct lima_sched_pipe *pipe)
{
    struct lima_fence *fence;

    fence = kmem_cache_zalloc(lima_fence_slab, GFP_KERNEL);
    if (!fence)
        return NULL;

    fence->pipe = pipe;
    dma_fence_init(&fence->base, &lima_fence_ops, &pipe->fence_lock,
               pipe->fence_context, ++pipe->fence_seqno);

    return fence;
}

static inline struct lima_sched_task *to_lima_task(struct drm_sched_job *job)
{
    return container_of(job, struct lima_sched_task, base);
}

static inline struct lima_sched_pipe *to_lima_pipe(struct drm_gpu_scheduler *sched)
{
    return container_of(sched, struct lima_sched_pipe, base);
}

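/*
 * Task setup: duplicate the caller's BO array, take a GEM reference on
 * each buffer, initialize and arm the drm_sched job, and hold a VM
 * reference for the lifetime of the task. Everything here is undone by
 * lima_sched_task_fini().
 */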
int lima_sched_task_init(struct lima_sched_task *task,
             struct lima_sched_context *context,
             struct lima_bo **bos, int num_bos,
             struct lima_vm *vm)
{
    int err, i;

    task->bos = kmemdup(bos, sizeof(*bos) * num_bos, GFP_KERNEL);
    if (!task->bos)
        return -ENOMEM;

    for (i = 0; i < num_bos; i++)
        drm_gem_object_get(&bos[i]->base.base);

    err = drm_sched_job_init(&task->base, &context->base, vm);
    if (err) {
        kfree(task->bos);
        return err;
    }

    drm_sched_job_arm(&task->base);

    task->num_bos = num_bos;
    task->vm = lima_vm_get(vm);

    return 0;
}

void lima_sched_task_fini(struct lima_sched_task *task)
{
    int i;

    drm_sched_job_cleanup(&task->base);

    if (task->bos) {
        for (i = 0; i < task->num_bos; i++)
            drm_gem_object_put(&task->bos[i]->base.base);
        kfree(task->bos);
    }

    lima_vm_put(task->vm);
}

int lima_sched_context_init(struct lima_sched_pipe *pipe,
                struct lima_sched_context *context,
                atomic_t *guilty)
{
    struct drm_gpu_scheduler *sched = &pipe->base;

    return drm_sched_entity_init(&context->base, DRM_SCHED_PRIORITY_NORMAL,
                     &sched, 1, guilty);
}

void lima_sched_context_fini(struct lima_sched_pipe *pipe,
                 struct lima_sched_context *context)
{
    drm_sched_entity_fini(&context->base);
}

struct dma_fence *lima_sched_context_queue_task(struct lima_sched_task *task)
{
    struct dma_fence *fence = dma_fence_get(&task->base.s_fence->finished);

    trace_lima_task_submit(task);
    drm_sched_entity_push_job(&task->base);
    return fence;
}

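/*
 * Runtime PM bracketing: lima_pm_busy() runs when a job starts and
 * lima_pm_idle() when it finishes or times out, so a runtime PM reference
 * is held exactly while work is in flight and the GPU may auto-suspend
 * afterwards. devfreq is notified of the busy/idle transitions as well.
 */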
static int lima_pm_busy(struct lima_device *ldev)
{
    int ret;

    /* resume GPU if it has been suspended by runtime PM */
    ret = pm_runtime_resume_and_get(ldev->dev);
    if (ret < 0)
        return ret;

    lima_devfreq_record_busy(&ldev->devfreq);
    return 0;
}

static void lima_pm_idle(struct lima_device *ldev)
{
    lima_devfreq_record_idle(&ldev->devfreq);

    /* GPU can do auto runtime suspend */
    pm_runtime_mark_last_busy(ldev->dev);
    pm_runtime_put_autosuspend(ldev->dev);
}

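/*
 * run_job back-end: called by the scheduler once all of the job's
 * dependencies have signalled. It creates the hardware fence, wakes the
 * GPU, flushes the L2 cache(s), switches the MMU(s) to the task's VM and
 * then starts the task on the hardware. Returning NULL tells the
 * scheduler that the job was not started (after a reset, or on
 * allocation/PM failure).
 */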
static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
{
    struct lima_sched_task *task = to_lima_task(job);
    struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
    struct lima_device *ldev = pipe->ldev;
    struct lima_fence *fence;
    int i, err;

    /* after GPU reset */
    if (job->s_fence->finished.error < 0)
        return NULL;

    fence = lima_fence_create(pipe);
    if (!fence)
        return NULL;

    err = lima_pm_busy(ldev);
    if (err < 0) {
        dma_fence_put(&fence->base);
        return NULL;
    }

    task->fence = &fence->base;

    /* Take an extra reference for the caller; otherwise the IRQ
     * handler may release the fence before the caller has used it.
     */
    dma_fence_get(task->fence);

    pipe->current_task = task;

    /* This is needed for the MMU to work correctly; otherwise GP/PP
     * will hang or page fault for unknown reasons after running for
     * a while.
     *
     * Still to investigate:
     * 1. is it related to the TLB?
     * 2. how much performance is lost to the L2 cache flush?
     * 3. can we call this less often, given that all GP/PP cores
     *    share the same L2 cache on the Mali-400?
     *
     * TODO:
     * 1. move this to task fini to save some wait time?
     * 2. when GP/PP use different L2 caches, does PP need to wait
     *    for the GP L2 cache flush?
     */
    for (i = 0; i < pipe->num_l2_cache; i++)
        lima_l2_cache_flush(pipe->l2_cache[i]);

    lima_vm_put(pipe->current_vm);
    pipe->current_vm = lima_vm_get(task->vm);

    if (pipe->bcast_mmu)
        lima_mmu_switch_vm(pipe->bcast_mmu, pipe->current_vm);
    else {
        for (i = 0; i < pipe->num_mmu; i++)
            lima_mmu_switch_vm(pipe->mmu[i], pipe->current_vm);
    }

    trace_lima_task_run(task);

    pipe->error = false;
    pipe->task_run(pipe, task);

    return task->fence;
}

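/*
 * When a task times out, its state is serialized into a dump buffer,
 * presumably for later retrieval by userspace: a lima_dump_task header
 * followed by chunks for the frame registers, the process name, the pid
 * and one chunk per buffer object. The list is capped at
 * lima_max_error_tasks entries.
 */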
static void lima_sched_build_error_task_list(struct lima_sched_task *task)
{
    struct lima_sched_error_task *et;
    struct lima_sched_pipe *pipe = to_lima_pipe(task->base.sched);
    struct lima_ip *ip = pipe->processor[0];
    int pipe_id = ip->id == lima_ip_gp ? lima_pipe_gp : lima_pipe_pp;
    struct lima_device *dev = ip->dev;
    struct lima_sched_context *sched_ctx =
        container_of(task->base.entity,
                 struct lima_sched_context, base);
    struct lima_ctx *ctx =
        container_of(sched_ctx, struct lima_ctx, context[pipe_id]);
    struct lima_dump_task *dt;
    struct lima_dump_chunk *chunk;
    struct lima_dump_chunk_pid *pid_chunk;
    struct lima_dump_chunk_buffer *buffer_chunk;
    u32 size, task_size, mem_size;
    int i;
    struct iosys_map map;
    int ret;

    mutex_lock(&dev->error_task_list_lock);

    if (dev->dump.num_tasks >= lima_max_error_tasks) {
        dev_info(dev->dev, "fail to save task state from %s pid %d: "
             "error task list is full\n", ctx->pname, ctx->pid);
        goto out;
    }

    /* frame chunk */
    size = sizeof(struct lima_dump_chunk) + pipe->frame_size;
    /* process name chunk */
    size += sizeof(struct lima_dump_chunk) + sizeof(ctx->pname);
    /* pid chunk */
    size += sizeof(struct lima_dump_chunk);
    /* buffer chunks */
    for (i = 0; i < task->num_bos; i++) {
        struct lima_bo *bo = task->bos[i];

        size += sizeof(struct lima_dump_chunk);
        size += bo->heap_size ? bo->heap_size : lima_bo_size(bo);
    }

    task_size = size + sizeof(struct lima_dump_task);
    mem_size = task_size + sizeof(*et);
    et = kvmalloc(mem_size, GFP_KERNEL);
    if (!et) {
        dev_err(dev->dev, "fail to alloc task dump buffer of size %x\n",
            mem_size);
        goto out;
    }

    et->data = et + 1;
    et->size = task_size;

    dt = et->data;
    memset(dt, 0, sizeof(*dt));
    dt->id = pipe_id;
    dt->size = size;

    chunk = (struct lima_dump_chunk *)(dt + 1);
    memset(chunk, 0, sizeof(*chunk));
    chunk->id = LIMA_DUMP_CHUNK_FRAME;
    chunk->size = pipe->frame_size;
    memcpy(chunk + 1, task->frame, pipe->frame_size);
    dt->num_chunks++;

    chunk = (void *)(chunk + 1) + chunk->size;
    memset(chunk, 0, sizeof(*chunk));
    chunk->id = LIMA_DUMP_CHUNK_PROCESS_NAME;
    chunk->size = sizeof(ctx->pname);
    memcpy(chunk + 1, ctx->pname, sizeof(ctx->pname));
    dt->num_chunks++;

    pid_chunk = (void *)(chunk + 1) + chunk->size;
    memset(pid_chunk, 0, sizeof(*pid_chunk));
    pid_chunk->id = LIMA_DUMP_CHUNK_PROCESS_ID;
    pid_chunk->pid = ctx->pid;
    dt->num_chunks++;

    buffer_chunk = (void *)(pid_chunk + 1) + pid_chunk->size;
    for (i = 0; i < task->num_bos; i++) {
        struct lima_bo *bo = task->bos[i];
        void *data;

        memset(buffer_chunk, 0, sizeof(*buffer_chunk));
        buffer_chunk->id = LIMA_DUMP_CHUNK_BUFFER;
        buffer_chunk->va = lima_vm_get_va(task->vm, bo);

        if (bo->heap_size) {
            buffer_chunk->size = bo->heap_size;

            data = vmap(bo->base.pages, bo->heap_size >> PAGE_SHIFT,
                    VM_MAP, pgprot_writecombine(PAGE_KERNEL));
            if (!data) {
                kvfree(et);
                goto out;
            }

            memcpy(buffer_chunk + 1, data, buffer_chunk->size);

            vunmap(data);
        } else {
            buffer_chunk->size = lima_bo_size(bo);

            ret = drm_gem_shmem_vmap(&bo->base, &map);
            if (ret) {
                kvfree(et);
                goto out;
            }

            memcpy(buffer_chunk + 1, map.vaddr, buffer_chunk->size);

            drm_gem_shmem_vunmap(&bo->base, &map);
        }

        buffer_chunk = (void *)(buffer_chunk + 1) + buffer_chunk->size;
        dt->num_chunks++;
    }

    list_add(&et->list, &dev->error_task_list);
    dev->dump.size += et->size;
    dev->dump.num_tasks++;

    dev_info(dev->dev, "save error task state success\n");

out:
    mutex_unlock(&dev->error_task_list_lock);
}

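/*
 * Timeout/hang recovery: stop the scheduler, raise the guilty job's
 * karma, optionally capture its state for the error dump, reset the
 * processor, resume any MMU stuck on a page fault, drop the current VM
 * and the runtime PM reference, then resubmit the remaining jobs and
 * restart the scheduler.
 */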
static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job)
{
    struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
    struct lima_sched_task *task = to_lima_task(job);
    struct lima_device *ldev = pipe->ldev;

    if (!pipe->error)
        DRM_ERROR("lima job timeout\n");

    drm_sched_stop(&pipe->base, &task->base);

    drm_sched_increase_karma(&task->base);

    if (lima_max_error_tasks)
        lima_sched_build_error_task_list(task);

    pipe->task_error(pipe);

    if (pipe->bcast_mmu)
        lima_mmu_page_fault_resume(pipe->bcast_mmu);
    else {
        int i;

        for (i = 0; i < pipe->num_mmu; i++)
            lima_mmu_page_fault_resume(pipe->mmu[i]);
    }

    lima_vm_put(pipe->current_vm);
    pipe->current_vm = NULL;
    pipe->current_task = NULL;

    lima_pm_idle(ldev);

    drm_sched_resubmit_jobs(&pipe->base);
    drm_sched_start(&pipe->base, true);

    return DRM_GPU_SCHED_STAT_NOMINAL;
}

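/*
 * free_job back-end: runs once the scheduler is done with a job. It drops
 * the task's reference on its hardware fence, unmaps every BO from the
 * task's VM, tears the task down and returns it to the pipe's task slab.
 */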
static void lima_sched_free_job(struct drm_sched_job *job)
{
    struct lima_sched_task *task = to_lima_task(job);
    struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
    struct lima_vm *vm = task->vm;
    struct lima_bo **bos = task->bos;
    int i;

    dma_fence_put(task->fence);

    for (i = 0; i < task->num_bos; i++)
        lima_vm_bo_del(vm, bos[i]);

    lima_sched_task_fini(task);
    kmem_cache_free(pipe->task_slab, task);
}

static const struct drm_sched_backend_ops lima_sched_ops = {
    .run_job = lima_sched_run_job,
    .timedout_job = lima_sched_timedout_job,
    .free_job = lima_sched_free_job,
};

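/*
 * Recovery worker for tasks that hit a recoverable error (see
 * lima_sched_pipe_task_done()): flush the L2 caches and MMU TLBs, then
 * ask the back-end to recover the task; if that fails, fall back to the
 * scheduler's normal timeout/reset path via drm_sched_fault().
 */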
static void lima_sched_recover_work(struct work_struct *work)
{
    struct lima_sched_pipe *pipe =
        container_of(work, struct lima_sched_pipe, recover_work);
    int i;

    for (i = 0; i < pipe->num_l2_cache; i++)
        lima_l2_cache_flush(pipe->l2_cache[i]);

    if (pipe->bcast_mmu) {
        lima_mmu_flush_tlb(pipe->bcast_mmu);
    } else {
        for (i = 0; i < pipe->num_mmu; i++)
            lima_mmu_flush_tlb(pipe->mmu[i]);
    }

    if (pipe->task_recover(pipe))
        drm_sched_fault(&pipe->base);
}

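/*
 * A pipe wraps a single drm_gpu_scheduler instance. The job timeout
 * defaults to 500 ms unless overridden via lima_sched_timeout_ms, and
 * each pipe gets its own fence context for the hardware fences it emits.
 */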
int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
{
    unsigned int timeout = lima_sched_timeout_ms > 0 ?
                   lima_sched_timeout_ms : 500;

    pipe->fence_context = dma_fence_context_alloc(1);
    spin_lock_init(&pipe->fence_lock);

    INIT_WORK(&pipe->recover_work, lima_sched_recover_work);

    return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
                  lima_job_hang_limit,
                  msecs_to_jiffies(timeout), NULL,
                  NULL, name, pipe->ldev->dev);
}

void lima_sched_pipe_fini(struct lima_sched_pipe *pipe)
{
    drm_sched_fini(&pipe->base);
}

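/*
 * Called from the interrupt path when the hardware finishes (or faults
 * on) the current task: errors either schedule the recover worker for
 * recoverable tasks or trigger the scheduler's reset path, while success
 * tears down the back-end state, signals the hardware fence and drops the
 * runtime PM reference.
 */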
void lima_sched_pipe_task_done(struct lima_sched_pipe *pipe)
{
    struct lima_sched_task *task = pipe->current_task;
    struct lima_device *ldev = pipe->ldev;

    if (pipe->error) {
        if (task && task->recoverable)
            schedule_work(&pipe->recover_work);
        else
            drm_sched_fault(&pipe->base);
    } else {
        pipe->task_fini(pipe);
        dma_fence_signal(task->fence);

        lima_pm_idle(ldev);
    }
}