// SPDX-License-Identifier: GPL-2.0+
/* Copyright (C) 2018 Broadcom */

/**
 * DOC: Broadcom V3D scheduling
 *
 * The shared DRM GPU scheduler is used to coordinate submitting jobs
 * to the hardware.  Each DRM fd (roughly a client process) gets its
 * own scheduler entity, which will process jobs in order.  The GPU
 * scheduler will round-robin between clients to submit the next job.
 *
 * For simplicity, and in order to keep latency low for interactive
 * jobs when bulk background jobs are queued up, we submit a new job
 * to the HW only when it has completed the last one, instead of
 * filling up the CT[01]Q FIFOs with jobs.  Similarly, we use
 * drm_sched_job_add_dependency() to manage the dependency between bin and
 * render, instead of having the clients submit jobs using the HW's
 * semaphores to interlock between them.
 */

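/* Illustrative sketch only, not code from this file: based on the
 * description above, the submit path (in v3d_gem.c) makes the render
 * job wait on the bin job with something along the lines of
 *
 *     drm_sched_job_add_dependency(&render->base.base,
 *                                  dma_fence_get(bin->base.done_fence));
 *
 * (field names approximate), which is what lets this scheduler code
 * avoid programming the HW semaphores.
 */
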
#include <linux/kthread.h>

#include "v3d_drv.h"
#include "v3d_regs.h"
#include "v3d_trace.h"

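/* Downcast helpers: each job type embeds a struct v3d_job, which in turn
 * embeds the struct drm_sched_job the scheduler hands back to us, so
 * container_of() recovers the driver-specific job structure.
 */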
static struct v3d_job *
to_v3d_job(struct drm_sched_job *sched_job)
{
    return container_of(sched_job, struct v3d_job, base);
}

static struct v3d_bin_job *
to_bin_job(struct drm_sched_job *sched_job)
{
    return container_of(sched_job, struct v3d_bin_job, base.base);
}

static struct v3d_render_job *
to_render_job(struct drm_sched_job *sched_job)
{
    return container_of(sched_job, struct v3d_render_job, base.base);
}

static struct v3d_tfu_job *
to_tfu_job(struct drm_sched_job *sched_job)
{
    return container_of(sched_job, struct v3d_tfu_job, base.base);
}

static struct v3d_csd_job *
to_csd_job(struct drm_sched_job *sched_job)
{
    return container_of(sched_job, struct v3d_csd_job, base.base);
}

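/* free_job hook: called once the scheduler is done with a job, so we can
 * drop our reference to it.
 */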
static void
v3d_sched_job_free(struct drm_sched_job *sched_job)
{
    struct v3d_job *job = to_v3d_job(sched_job);

    v3d_job_cleanup(job);
}

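/* Keep the active performance monitor in sync with the job about to run:
 * stop the current one if the job wants a different perfmon (or none),
 * then start the job's perfmon if it isn't already running.
 */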
static void
v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
{
    if (job->perfmon != v3d->active_perfmon)
        v3d_perfmon_stop(v3d, v3d->active_perfmon, true);

    if (job->perfmon && v3d->active_perfmon != job->perfmon)
        v3d_perfmon_start(v3d, job->perfmon);
}

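/* run_job hook for the binner (CT0) queue.  Returns the fence that the
 * bin-done interrupt will signal; per the DOC comment above, only one
 * job is in flight on the queue at a time.
 */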
static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
{
    struct v3d_bin_job *job = to_bin_job(sched_job);
    struct v3d_dev *v3d = job->base.v3d;
    struct drm_device *dev = &v3d->drm;
    struct dma_fence *fence;
    unsigned long irqflags;

    if (unlikely(job->base.base.s_fence->finished.error))
        return NULL;

    /* Lock required around bin_job update vs
     * v3d_overflow_mem_work().
     */
    spin_lock_irqsave(&v3d->job_lock, irqflags);
    v3d->bin_job = job;
    /* Clear out the overflow allocation, so we don't
     * reuse the overflow attached to a previous job.
     */
    V3D_CORE_WRITE(0, V3D_PTB_BPOS, 0);
    spin_unlock_irqrestore(&v3d->job_lock, irqflags);

    v3d_invalidate_caches(v3d);

    fence = v3d_fence_create(v3d, V3D_BIN);
    if (IS_ERR(fence))
        return NULL;

    if (job->base.irq_fence)
        dma_fence_put(job->base.irq_fence);
    job->base.irq_fence = dma_fence_get(fence);

    trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
                job->start, job->end);

    v3d_switch_perfmon(v3d, &job->base);

    /* Set the current and end address of the control list.
     * Writing the end register is what starts the job.
     */
    if (job->qma) {
        V3D_CORE_WRITE(0, V3D_CLE_CT0QMA, job->qma);
        V3D_CORE_WRITE(0, V3D_CLE_CT0QMS, job->qms);
    }
    if (job->qts) {
        V3D_CORE_WRITE(0, V3D_CLE_CT0QTS,
                   V3D_CLE_CT0QTS_ENABLE |
                   job->qts);
    }
    V3D_CORE_WRITE(0, V3D_CLE_CT0QBA, job->start);
    V3D_CORE_WRITE(0, V3D_CLE_CT0QEA, job->end);

    return fence;
}

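/* run_job hook for the render (CT1) queue.  Same pattern as the binner:
 * create a fence, program the control list bounds, and let the CT1QEA
 * write start execution.
 */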
static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
{
    struct v3d_render_job *job = to_render_job(sched_job);
    struct v3d_dev *v3d = job->base.v3d;
    struct drm_device *dev = &v3d->drm;
    struct dma_fence *fence;

    if (unlikely(job->base.base.s_fence->finished.error))
        return NULL;

    v3d->render_job = job;

    /* Can we avoid this flush?  We need to be careful of
     * scheduling, though -- imagine job0 rendering to texture and
     * job1 reading, and them being executed as bin0, bin1,
     * render0, render1, so that render1's flush at bin time
     * wasn't enough.
     */
    v3d_invalidate_caches(v3d);

    fence = v3d_fence_create(v3d, V3D_RENDER);
    if (IS_ERR(fence))
        return NULL;

    if (job->base.irq_fence)
        dma_fence_put(job->base.irq_fence);
    job->base.irq_fence = dma_fence_get(fence);

    trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
                job->start, job->end);

    v3d_switch_perfmon(v3d, &job->base);

    /* XXX: Set the QCFG */

    /* Set the current and end address of the control list.
     * Writing the end register is what starts the job.
     */
    V3D_CORE_WRITE(0, V3D_CLE_CT1QBA, job->start);
    V3D_CORE_WRITE(0, V3D_CLE_CT1QEA, job->end);

    return fence;
}

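/* run_job hook for the TFU (texture formatting unit).  The job is fully
 * described by the register values captured at submit time; the final
 * ICFG write starts it.
 */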
static struct dma_fence *
v3d_tfu_job_run(struct drm_sched_job *sched_job)
{
    struct v3d_tfu_job *job = to_tfu_job(sched_job);
    struct v3d_dev *v3d = job->base.v3d;
    struct drm_device *dev = &v3d->drm;
    struct dma_fence *fence;

    fence = v3d_fence_create(v3d, V3D_TFU);
    if (IS_ERR(fence))
        return NULL;

    v3d->tfu_job = job;
    if (job->base.irq_fence)
        dma_fence_put(job->base.irq_fence);
    job->base.irq_fence = dma_fence_get(fence);

    trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);

    V3D_WRITE(V3D_TFU_IIA, job->args.iia);
    V3D_WRITE(V3D_TFU_IIS, job->args.iis);
    V3D_WRITE(V3D_TFU_ICA, job->args.ica);
    V3D_WRITE(V3D_TFU_IUA, job->args.iua);
    V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
    V3D_WRITE(V3D_TFU_IOS, job->args.ios);
    V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
    if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
        V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
        V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
        V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
    }
    /* ICFG kicks off the job. */
    V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);

    return fence;
}

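/* run_job hook for the CSD (compute shader dispatch) queue.  CFG1-CFG6
 * are queued first, and the CFG0 write dispatches the job.
 */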
static struct dma_fence *
v3d_csd_job_run(struct drm_sched_job *sched_job)
{
    struct v3d_csd_job *job = to_csd_job(sched_job);
    struct v3d_dev *v3d = job->base.v3d;
    struct drm_device *dev = &v3d->drm;
    struct dma_fence *fence;
    int i;

    v3d->csd_job = job;

    v3d_invalidate_caches(v3d);

    fence = v3d_fence_create(v3d, V3D_CSD);
    if (IS_ERR(fence))
        return NULL;

    if (job->base.irq_fence)
        dma_fence_put(job->base.irq_fence);
    job->base.irq_fence = dma_fence_get(fence);

    trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);

    v3d_switch_perfmon(v3d, &job->base);

    for (i = 1; i <= 6; i++)
        V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
    /* CFG0 write kicks off the job. */
    V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]);

    return fence;
}

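/* The cache_clean queue runs synchronously: v3d_clean_caches() waits for
 * the hardware to finish, so no fence is returned and the scheduler can
 * consider the job complete as soon as run_job returns.
 */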
static struct dma_fence *
v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
{
    struct v3d_job *job = to_v3d_job(sched_job);
    struct v3d_dev *v3d = job->v3d;

    v3d_clean_caches(v3d);

    return NULL;
}

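/* Common timeout path: park every queue's scheduler, bump the guilty
 * job's karma, reset the GPU, then resubmit the unfinished jobs and
 * restart the schedulers.
 */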
static enum drm_gpu_sched_stat
v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
{
    enum v3d_queue q;

    mutex_lock(&v3d->reset_lock);

    /* block scheduler */
    for (q = 0; q < V3D_MAX_QUEUES; q++)
        drm_sched_stop(&v3d->queue[q].sched, sched_job);

    if (sched_job)
        drm_sched_increase_karma(sched_job);

    /* get the GPU back into the init state */
    v3d_reset(v3d);

    for (q = 0; q < V3D_MAX_QUEUES; q++)
        drm_sched_resubmit_jobs(&v3d->queue[q].sched);

    /* Unblock schedulers and restart their jobs. */
    for (q = 0; q < V3D_MAX_QUEUES; q++) {
        drm_sched_start(&v3d->queue[q].sched, true);
    }

    mutex_unlock(&v3d->reset_lock);

    return DRM_GPU_SCHED_STAT_NOMINAL;
}

/* If the current address or return address has changed, then the GPU
 * has probably made progress and we should delay the reset.  This
 * could fail if the GPU got in an infinite loop in the CL, but that
 * is pretty unlikely outside of an i-g-t testcase.
 */
static enum drm_gpu_sched_stat
v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
            u32 *timedout_ctca, u32 *timedout_ctra)
{
    struct v3d_job *job = to_v3d_job(sched_job);
    struct v3d_dev *v3d = job->v3d;
    u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
    u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));

    if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
        *timedout_ctca = ctca;
        *timedout_ctra = ctra;
        return DRM_GPU_SCHED_STAT_NOMINAL;
    }

    return v3d_gpu_reset_for_timeout(v3d, sched_job);
}

static enum drm_gpu_sched_stat
v3d_bin_job_timedout(struct drm_sched_job *sched_job)
{
    struct v3d_bin_job *job = to_bin_job(sched_job);

    return v3d_cl_job_timedout(sched_job, V3D_BIN,
                   &job->timedout_ctca, &job->timedout_ctra);
}

static enum drm_gpu_sched_stat
v3d_render_job_timedout(struct drm_sched_job *sched_job)
{
    struct v3d_render_job *job = to_render_job(sched_job);

    return v3d_cl_job_timedout(sched_job, V3D_RENDER,
                   &job->timedout_ctca, &job->timedout_ctra);
}

static enum drm_gpu_sched_stat
v3d_generic_job_timedout(struct drm_sched_job *sched_job)
{
    struct v3d_job *job = to_v3d_job(sched_job);

    return v3d_gpu_reset_for_timeout(job->v3d, sched_job);
}

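/* CSD timeout handling mirrors the control-list case above, but uses the
 * CURRENT_CFG4 register (which changes as batches are dispatched) as the
 * progress indicator.
 */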
static enum drm_gpu_sched_stat
v3d_csd_job_timedout(struct drm_sched_job *sched_job)
{
    struct v3d_csd_job *job = to_csd_job(sched_job);
    struct v3d_dev *v3d = job->base.v3d;
    u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4);

    /* If we've made progress, skip reset and let the timer get
     * rearmed.
     */
    if (job->timedout_batches != batches) {
        job->timedout_batches = batches;
        return DRM_GPU_SCHED_STAT_NOMINAL;
    }

    return v3d_gpu_reset_for_timeout(v3d, sched_job);
}

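/* Per-queue scheduler backend ops; they differ only in their run_job and
 * timeout handlers.
 */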
static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
    .run_job = v3d_bin_job_run,
    .timedout_job = v3d_bin_job_timedout,
    .free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_render_sched_ops = {
    .run_job = v3d_render_job_run,
    .timedout_job = v3d_render_job_timedout,
    .free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
    .run_job = v3d_tfu_job_run,
    .timedout_job = v3d_generic_job_timedout,
    .free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
    .run_job = v3d_csd_job_run,
    .timedout_job = v3d_csd_job_timedout,
    .free_job = v3d_sched_job_free
};

static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
    .run_job = v3d_cache_clean_job_run,
    .timedout_job = v3d_generic_job_timedout,
    .free_job = v3d_sched_job_free
};

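/* Create one drm_gpu_scheduler per hardware queue.  hw_jobs_limit of 1
 * keeps a single job in flight per queue, matching the design notes at
 * the top of this file; the CSD and CACHE_CLEAN queues are only set up
 * on hardware that has a compute shader dispatcher.
 */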
int
v3d_sched_init(struct v3d_dev *v3d)
{
    int hw_jobs_limit = 1;
    int job_hang_limit = 0;
    int hang_limit_ms = 500;
    int ret;

    ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
                 &v3d_bin_sched_ops,
                 hw_jobs_limit, job_hang_limit,
                 msecs_to_jiffies(hang_limit_ms), NULL,
                 NULL, "v3d_bin", v3d->drm.dev);
    if (ret)
        return ret;

    ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
                 &v3d_render_sched_ops,
                 hw_jobs_limit, job_hang_limit,
                 msecs_to_jiffies(hang_limit_ms), NULL,
                 NULL, "v3d_render", v3d->drm.dev);
    if (ret)
        goto fail;

    ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
                 &v3d_tfu_sched_ops,
                 hw_jobs_limit, job_hang_limit,
                 msecs_to_jiffies(hang_limit_ms), NULL,
                 NULL, "v3d_tfu", v3d->drm.dev);
    if (ret)
        goto fail;

    if (v3d_has_csd(v3d)) {
        ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
                     &v3d_csd_sched_ops,
                     hw_jobs_limit, job_hang_limit,
                     msecs_to_jiffies(hang_limit_ms), NULL,
                     NULL, "v3d_csd", v3d->drm.dev);
        if (ret)
            goto fail;

        ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
                     &v3d_cache_clean_sched_ops,
                     hw_jobs_limit, job_hang_limit,
                     msecs_to_jiffies(hang_limit_ms), NULL,
                     NULL, "v3d_cache_clean", v3d->drm.dev);
        if (ret)
            goto fail;
    }

    return 0;

fail:
    v3d_sched_fini(v3d);
    return ret;
}

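/* Tear down every scheduler that was successfully initialized; checking
 * sched.ready lets this also serve as the error-unwind path for
 * v3d_sched_init().
 */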
void
v3d_sched_fini(struct v3d_dev *v3d)
{
    enum v3d_queue q;

    for (q = 0; q < V3D_MAX_QUEUES; q++) {
        if (v3d->queue[q].sched.ready)
            drm_sched_fini(&v3d->queue[q].sched);
    }
}