1 From 60c65dc612663be7136a19a117cee5d194530600 Mon Sep 17 00:00:00 2001
2 From: Eric Anholt <eric@anholt.net>
3 Date: Wed, 28 Nov 2018 15:09:25 -0800
4 Subject: [PATCH 570/703] drm/v3d: Add support for submitting jobs to the TFU.
6 The TFU can copy from raster, UIF, and SAND input images to UIF output
7 images, with optional mipmap generation. This will certainly be
8 useful for media EGL image input, but is also useful immediately for
9 mipmap generation without bogging the V3D core down.
11 For now we only run the queue 1 job deep, and don't have any hang
12 recovery (though I don't think we should need it, with TFU). Queuing
13 multiple jobs in the HW will require synchronizing the YUV coefficient
14 regs updates since they don't get FIFOed with the job.
16 v2: Change the ioctl to IOW instead of IOWR, always set COEF0, explain
17 why TFU is AUTH, clarify the syncing docs, drop the unused TFU
18 interrupt regs (you're expected to use the hub's), don't take
19 &bo->base for NULL bos.
20 v3: Fix a little whitespace alignment (noticed by checkpatch), rebase
21 on drm_sched_job_cleanup() changes.
23 Signed-off-by: Eric Anholt <eric@anholt.net>
24 Reviewed-by: Dave Emett <david.emett@broadcom.com> (v2)
25 Link: https://patchwork.freedesktop.org/patch/264607/
26 (cherry picked from commit 1584f16ca96ef124aad79efa3303cff5f3530e2c)
28 drivers/gpu/drm/v3d/v3d_drv.c | 15 ++-
29 drivers/gpu/drm/v3d/v3d_drv.h | 32 +++++-
30 drivers/gpu/drm/v3d/v3d_gem.c | 178 ++++++++++++++++++++++++++++----
31 drivers/gpu/drm/v3d/v3d_irq.c | 12 ++-
32 drivers/gpu/drm/v3d/v3d_regs.h | 49 +++++++++
33 drivers/gpu/drm/v3d/v3d_sched.c | 148 ++++++++++++++++++++++----
34 drivers/gpu/drm/v3d/v3d_trace.h | 20 ++++
35 include/uapi/drm/v3d_drm.h | 25 +++++
36 8 files changed, 426 insertions(+), 53 deletions(-)
38 --- a/drivers/gpu/drm/v3d/v3d_drv.c
39 +++ b/drivers/gpu/drm/v3d/v3d_drv.c
40 @@ -112,10 +112,15 @@ static int v3d_get_param_ioctl(struct dr
44 - /* Any params that aren't just register reads would go here. */
46 - DRM_DEBUG("Unknown parameter %d\n", args->param);
48 + switch (args->param) {
49 + case DRM_V3D_PARAM_SUPPORTS_TFU:
53 + DRM_DEBUG("Unknown parameter %d\n", args->param);
59 @@ -170,7 +175,8 @@ static const struct file_operations v3d_
60 /* DRM_AUTH is required on SUBMIT_CL for now, while we don't have GMP
61 * protection between clients. Note that render nodes would be be
62 * able to submit CLs that could access BOs from clients authenticated
63 - * with the master node.
64 + * with the master node. The TFU doesn't use the GMP, so it would
65 + * need to stay DRM_AUTH until we do buffer size/offset validation.
67 static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
68 DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CL, v3d_submit_cl_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
69 @@ -179,6 +185,7 @@ static const struct drm_ioctl_desc v3d_d
70 DRM_IOCTL_DEF_DRV(V3D_MMAP_BO, v3d_mmap_bo_ioctl, DRM_RENDER_ALLOW),
71 DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
72 DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
73 + DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
76 static const struct vm_operations_struct v3d_vm_ops = {
77 --- a/drivers/gpu/drm/v3d/v3d_drv.h
78 +++ b/drivers/gpu/drm/v3d/v3d_drv.h
80 #include <drm/drm_encoder.h>
81 #include <drm/drm_gem.h>
82 #include <drm/gpu_scheduler.h>
83 +#include "uapi/drm/v3d_drm.h"
85 #define GMP_GRANULARITY (128 * 1024)
87 -/* Enum for each of the V3D queues. We maintain various queue
88 - * tracking as an array because at some point we'll want to support
89 - * the TFU (texture formatting unit) as another queue.
91 +/* Enum for each of the V3D queues. */
98 -#define V3D_MAX_QUEUES (V3D_RENDER + 1)
99 +#define V3D_MAX_QUEUES (V3D_TFU + 1)
101 struct v3d_queue_state {
102 struct drm_gpu_scheduler sched;
103 @@ -68,6 +67,7 @@ struct v3d_dev {
105 struct v3d_exec_info *bin_job;
106 struct v3d_exec_info *render_job;
107 + struct v3d_tfu_job *tfu_job;
109 struct v3d_queue_state queue[V3D_MAX_QUEUES];
111 @@ -218,6 +218,25 @@ struct v3d_exec_info {
115 +struct v3d_tfu_job {
116 + struct drm_sched_job base;
118 + struct drm_v3d_submit_tfu args;
120 + /* An optional fence userspace can pass in for the job to depend on. */
121 + struct dma_fence *in_fence;
123 + /* v3d fence to be signaled by IRQ handler when the job is complete. */
124 + struct dma_fence *done_fence;
126 + struct v3d_dev *v3d;
128 + struct kref refcount;
130 + /* This is the array of BOs that were looked up at the start of exec. */
131 + struct v3d_bo *bo[4];
135 * _wait_for - magic (register) wait macro
137 @@ -281,9 +300,12 @@ int v3d_gem_init(struct drm_device *dev)
138 void v3d_gem_destroy(struct drm_device *dev);
139 int v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
140 struct drm_file *file_priv);
141 +int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
142 + struct drm_file *file_priv);
143 int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
144 struct drm_file *file_priv);
145 void v3d_exec_put(struct v3d_exec_info *exec);
146 +void v3d_tfu_job_put(struct v3d_tfu_job *exec);
147 void v3d_reset(struct v3d_dev *v3d);
148 void v3d_invalidate_caches(struct v3d_dev *v3d);
149 void v3d_flush_caches(struct v3d_dev *v3d);
150 --- a/drivers/gpu/drm/v3d/v3d_gem.c
151 +++ b/drivers/gpu/drm/v3d/v3d_gem.c
152 @@ -207,26 +207,27 @@ v3d_flush_caches(struct v3d_dev *v3d)
156 -v3d_attach_object_fences(struct v3d_exec_info *exec)
157 +v3d_attach_object_fences(struct v3d_bo **bos, int bo_count,
158 + struct dma_fence *fence)
160 - struct dma_fence *out_fence = exec->render_done_fence;
163 - for (i = 0; i < exec->bo_count; i++) {
164 + for (i = 0; i < bo_count; i++) {
165 /* XXX: Use shared fences for read-only objects. */
166 - reservation_object_add_excl_fence(exec->bo[i]->resv, out_fence);
167 + reservation_object_add_excl_fence(bos[i]->resv, fence);
172 v3d_unlock_bo_reservations(struct drm_device *dev,
173 - struct v3d_exec_info *exec,
174 + struct v3d_bo **bos,
176 struct ww_acquire_ctx *acquire_ctx)
180 - for (i = 0; i < exec->bo_count; i++)
181 - ww_mutex_unlock(&exec->bo[i]->resv->lock);
182 + for (i = 0; i < bo_count; i++)
183 + ww_mutex_unlock(&bos[i]->resv->lock);
185 ww_acquire_fini(acquire_ctx);
187 @@ -240,7 +241,8 @@ v3d_unlock_bo_reservations(struct drm_de
190 v3d_lock_bo_reservations(struct drm_device *dev,
191 - struct v3d_exec_info *exec,
192 + struct v3d_bo **bos,
194 struct ww_acquire_ctx *acquire_ctx)
196 int contended_lock = -1;
197 @@ -250,7 +252,7 @@ v3d_lock_bo_reservations(struct drm_devi
200 if (contended_lock != -1) {
201 - struct v3d_bo *bo = exec->bo[contended_lock];
202 + struct v3d_bo *bo = bos[contended_lock];
204 ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
206 @@ -260,20 +262,20 @@ retry:
210 - for (i = 0; i < exec->bo_count; i++) {
211 + for (i = 0; i < bo_count; i++) {
212 if (i == contended_lock)
215 - ret = ww_mutex_lock_interruptible(&exec->bo[i]->resv->lock,
216 + ret = ww_mutex_lock_interruptible(&bos[i]->resv->lock,
221 for (j = 0; j < i; j++)
222 - ww_mutex_unlock(&exec->bo[j]->resv->lock);
223 + ww_mutex_unlock(&bos[j]->resv->lock);
225 if (contended_lock != -1 && contended_lock >= i) {
226 - struct v3d_bo *bo = exec->bo[contended_lock];
227 + struct v3d_bo *bo = bos[contended_lock];
229 ww_mutex_unlock(&bo->resv->lock);
231 @@ -293,10 +295,11 @@ retry:
232 /* Reserve space for our shared (read-only) fence references,
233 * before we commit the CL to the hardware.
235 - for (i = 0; i < exec->bo_count; i++) {
236 - ret = reservation_object_reserve_shared(exec->bo[i]->resv);
237 + for (i = 0; i < bo_count; i++) {
238 + ret = reservation_object_reserve_shared(bos[i]->resv);
240 - v3d_unlock_bo_reservations(dev, exec, acquire_ctx);
241 + v3d_unlock_bo_reservations(dev, bos, bo_count,
246 @@ -419,6 +422,33 @@ void v3d_exec_put(struct v3d_exec_info *
247 kref_put(&exec->refcount, v3d_exec_cleanup);
251 +v3d_tfu_job_cleanup(struct kref *ref)
253 + struct v3d_tfu_job *job = container_of(ref, struct v3d_tfu_job,
255 + struct v3d_dev *v3d = job->v3d;
258 + dma_fence_put(job->in_fence);
259 + dma_fence_put(job->done_fence);
261 + for (i = 0; i < ARRAY_SIZE(job->bo); i++) {
263 + drm_gem_object_put_unlocked(&job->bo[i]->base);
266 + pm_runtime_mark_last_busy(v3d->dev);
267 + pm_runtime_put_autosuspend(v3d->dev);
272 +void v3d_tfu_job_put(struct v3d_tfu_job *job)
274 + kref_put(&job->refcount, v3d_tfu_job_cleanup);
278 v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
279 struct drm_file *file_priv)
280 @@ -536,7 +566,8 @@ v3d_submit_cl_ioctl(struct drm_device *d
284 - ret = v3d_lock_bo_reservations(dev, exec, &acquire_ctx);
285 + ret = v3d_lock_bo_reservations(dev, exec->bo, exec->bo_count,
290 @@ -570,9 +601,10 @@ v3d_submit_cl_ioctl(struct drm_device *d
291 &v3d_priv->sched_entity[V3D_RENDER]);
292 mutex_unlock(&v3d->sched_lock);
294 - v3d_attach_object_fences(exec);
295 + v3d_attach_object_fences(exec->bo, exec->bo_count,
296 + exec->render_done_fence);
298 - v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
299 + v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
301 /* Update the return sync object for the */
302 sync_out = drm_syncobj_find(file_priv, args->out_sync);
303 @@ -588,12 +620,118 @@ v3d_submit_cl_ioctl(struct drm_device *d
306 mutex_unlock(&v3d->sched_lock);
307 - v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
308 + v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
316 + * v3d_submit_tfu_ioctl() - Submits a TFU (texture formatting) job to the V3D.
318 + * @data: ioctl argument
319 + * @file_priv: DRM file for this fd
321 + * Userspace provides the register setup for the TFU, which we don't
322 + * need to validate since the TFU is behind the MMU.
325 +v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
326 + struct drm_file *file_priv)
328 + struct v3d_dev *v3d = to_v3d_dev(dev);
329 + struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
330 + struct drm_v3d_submit_tfu *args = data;
331 + struct v3d_tfu_job *job;
332 + struct ww_acquire_ctx acquire_ctx;
333 + struct drm_syncobj *sync_out;
334 + struct dma_fence *sched_done_fence;
338 + job = kcalloc(1, sizeof(*job), GFP_KERNEL);
342 + ret = pm_runtime_get_sync(v3d->dev);
348 + kref_init(&job->refcount);
350 + ret = drm_syncobj_find_fence(file_priv, args->in_sync,
351 + 0, &job->in_fence);
352 + if (ret == -EINVAL)
358 + spin_lock(&file_priv->table_lock);
359 + for (bo_count = 0; bo_count < ARRAY_SIZE(job->bo); bo_count++) {
360 + struct drm_gem_object *bo;
362 + if (!args->bo_handles[bo_count])
365 + bo = idr_find(&file_priv->object_idr,
366 + args->bo_handles[bo_count]);
368 + DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
369 + bo_count, args->bo_handles[bo_count]);
371 + spin_unlock(&file_priv->table_lock);
374 + drm_gem_object_get(bo);
375 + job->bo[bo_count] = to_v3d_bo(bo);
377 + spin_unlock(&file_priv->table_lock);
379 + ret = v3d_lock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
383 + mutex_lock(&v3d->sched_lock);
384 + ret = drm_sched_job_init(&job->base,
385 + &v3d_priv->sched_entity[V3D_TFU],
388 + goto fail_unreserve;
390 + sched_done_fence = dma_fence_get(&job->base.s_fence->finished);
392 + kref_get(&job->refcount); /* put by scheduler job completion */
393 + drm_sched_entity_push_job(&job->base, &v3d_priv->sched_entity[V3D_TFU]);
394 + mutex_unlock(&v3d->sched_lock);
396 + v3d_attach_object_fences(job->bo, bo_count, sched_done_fence);
398 + v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
400 + /* Update the return sync object */
401 + sync_out = drm_syncobj_find(file_priv, args->out_sync);
403 + drm_syncobj_replace_fence(sync_out, sched_done_fence);
404 + drm_syncobj_put(sync_out);
406 + dma_fence_put(sched_done_fence);
408 + v3d_tfu_job_put(job);
413 + mutex_unlock(&v3d->sched_lock);
414 + v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
416 + v3d_tfu_job_put(job);
422 v3d_gem_init(struct drm_device *dev)
423 --- a/drivers/gpu/drm/v3d/v3d_irq.c
424 +++ b/drivers/gpu/drm/v3d/v3d_irq.c
427 * DOC: Interrupt management for the V3D engine
429 - * When we take a binning or rendering flush done interrupt, we need
430 - * to signal the fence for that job so that the scheduler can queue up
431 + * When we take a bin, render, or TFU done interrupt, we need to
432 + * signal the fence for that job so that the scheduler can queue up
433 * the next one and unblock any waiters.
435 * When we take the binner out of memory interrupt, we need to
438 #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV | \
439 V3D_HUB_INT_MMU_PTI | \
440 - V3D_HUB_INT_MMU_CAP))
441 + V3D_HUB_INT_MMU_CAP | \
445 v3d_overflow_mem_work(struct work_struct *work)
446 @@ -117,6 +118,11 @@ v3d_hub_irq(int irq, void *arg)
447 /* Acknowledge the interrupts we're handling here. */
448 V3D_WRITE(V3D_HUB_INT_CLR, intsts);
450 + if (intsts & V3D_HUB_INT_TFUC) {
451 + dma_fence_signal(v3d->tfu_job->done_fence);
452 + status = IRQ_HANDLED;
455 if (intsts & (V3D_HUB_INT_MMU_WRV |
456 V3D_HUB_INT_MMU_PTI |
457 V3D_HUB_INT_MMU_CAP)) {
458 --- a/drivers/gpu/drm/v3d/v3d_regs.h
459 +++ b/drivers/gpu/drm/v3d/v3d_regs.h
461 # define V3D_TOP_GR_BRIDGE_SW_INIT_1 0x0000c
462 # define V3D_TOP_GR_BRIDGE_SW_INIT_1_V3D_CLK_108_SW_INIT BIT(0)
464 +#define V3D_TFU_CS 0x00400
465 +/* Stops current job, empties input fifo. */
466 +# define V3D_TFU_CS_TFURST BIT(31)
467 +# define V3D_TFU_CS_CVTCT_MASK V3D_MASK(23, 16)
468 +# define V3D_TFU_CS_CVTCT_SHIFT 16
469 +# define V3D_TFU_CS_NFREE_MASK V3D_MASK(13, 8)
470 +# define V3D_TFU_CS_NFREE_SHIFT 8
471 +# define V3D_TFU_CS_BUSY BIT(0)
473 +#define V3D_TFU_SU 0x00404
474 +/* Interrupt when FINTTHR input slots are free (0 = disabled) */
475 +# define V3D_TFU_SU_FINTTHR_MASK V3D_MASK(13, 8)
476 +# define V3D_TFU_SU_FINTTHR_SHIFT 8
477 +/* Skips resetting the CRC at the start of CRC generation. */
478 +# define V3D_TFU_SU_CRCCHAIN BIT(4)
479 +/* Skips writes, computes CRC of the image. miplevels must be 0. */
480 +# define V3D_TFU_SU_CRC BIT(3)
481 +# define V3D_TFU_SU_THROTTLE_MASK V3D_MASK(1, 0)
482 +# define V3D_TFU_SU_THROTTLE_SHIFT 0
484 +#define V3D_TFU_ICFG 0x00408
485 +/* Interrupt when the conversion is complete. */
486 +# define V3D_TFU_ICFG_IOC BIT(0)
488 +/* Input Image Address */
489 +#define V3D_TFU_IIA 0x0040c
490 +/* Input Chroma Address */
491 +#define V3D_TFU_ICA 0x00410
492 +/* Input Image Stride */
493 +#define V3D_TFU_IIS 0x00414
494 +/* Input Image U-Plane Address */
495 +#define V3D_TFU_IUA 0x00418
496 +/* Output Image Address */
497 +#define V3D_TFU_IOA 0x0041c
498 +/* Image Output Size */
499 +#define V3D_TFU_IOS 0x00420
500 +/* TFU YUV Coefficient 0 */
501 +#define V3D_TFU_COEF0 0x00424
502 +/* Use these regs instead of the defaults. */
503 +# define V3D_TFU_COEF0_USECOEF BIT(31)
504 +/* TFU YUV Coefficient 1 */
505 +#define V3D_TFU_COEF1 0x00428
506 +/* TFU YUV Coefficient 2 */
507 +#define V3D_TFU_COEF2 0x0042c
508 +/* TFU YUV Coefficient 3 */
509 +#define V3D_TFU_COEF3 0x00430
511 +#define V3D_TFU_CRC 0x00434
513 /* Per-MMU registers. */
515 #define V3D_MMUC_CONTROL 0x01000
516 --- a/drivers/gpu/drm/v3d/v3d_sched.c
517 +++ b/drivers/gpu/drm/v3d/v3d_sched.c
518 @@ -30,6 +30,12 @@ to_v3d_job(struct drm_sched_job *sched_j
519 return container_of(sched_job, struct v3d_job, base);
522 +static struct v3d_tfu_job *
523 +to_tfu_job(struct drm_sched_job *sched_job)
525 + return container_of(sched_job, struct v3d_tfu_job, base);
529 v3d_job_free(struct drm_sched_job *sched_job)
531 @@ -38,6 +44,14 @@ v3d_job_free(struct drm_sched_job *sched
532 v3d_exec_put(job->exec);
536 +v3d_tfu_job_free(struct drm_sched_job *sched_job)
538 + struct v3d_tfu_job *job = to_tfu_job(sched_job);
540 + v3d_tfu_job_put(job);
544 * Returns the fences that the bin or render job depends on, one by one.
545 * v3d_job_run() won't be called until all of them have been signaled.
546 @@ -76,6 +90,27 @@ v3d_job_dependency(struct drm_sched_job
551 + * Returns the fences that the TFU job depends on, one by one.
552 + * v3d_tfu_job_run() won't be called until all of them have been
555 +static struct dma_fence *
556 +v3d_tfu_job_dependency(struct drm_sched_job *sched_job,
557 + struct drm_sched_entity *s_entity)
559 + struct v3d_tfu_job *job = to_tfu_job(sched_job);
560 + struct dma_fence *fence;
562 + fence = job->in_fence;
564 + job->in_fence = NULL;
571 static struct dma_fence *v3d_job_run(struct drm_sched_job *sched_job)
573 struct v3d_job *job = to_v3d_job(sched_job);
574 @@ -147,31 +182,47 @@ static struct dma_fence *v3d_job_run(str
579 -v3d_job_timedout(struct drm_sched_job *sched_job)
580 +static struct dma_fence *
581 +v3d_tfu_job_run(struct drm_sched_job *sched_job)
583 - struct v3d_job *job = to_v3d_job(sched_job);
584 - struct v3d_exec_info *exec = job->exec;
585 - struct v3d_dev *v3d = exec->v3d;
586 - enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
588 - u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
589 - u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
590 + struct v3d_tfu_job *job = to_tfu_job(sched_job);
591 + struct v3d_dev *v3d = job->v3d;
592 + struct drm_device *dev = &v3d->drm;
593 + struct dma_fence *fence;
595 - /* If the current address or return address have changed, then
596 - * the GPU has probably made progress and we should delay the
597 - * reset. This could fail if the GPU got in an infinite loop
598 - * in the CL, but that is pretty unlikely outside of an i-g-t
601 - if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
602 - job->timedout_ctca = ctca;
603 - job->timedout_ctra = ctra;
604 + fence = v3d_fence_create(v3d, V3D_TFU);
608 - schedule_delayed_work(&job->base.work_tdr,
609 - job->base.sched->timeout);
611 + v3d->tfu_job = job;
612 + if (job->done_fence)
613 + dma_fence_put(job->done_fence);
614 + job->done_fence = dma_fence_get(fence);
616 + trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
618 + V3D_WRITE(V3D_TFU_IIA, job->args.iia);
619 + V3D_WRITE(V3D_TFU_IIS, job->args.iis);
620 + V3D_WRITE(V3D_TFU_ICA, job->args.ica);
621 + V3D_WRITE(V3D_TFU_IUA, job->args.iua);
622 + V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
623 + V3D_WRITE(V3D_TFU_IOS, job->args.ios);
624 + V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
625 + if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
626 + V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
627 + V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
628 + V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
630 + /* ICFG kicks off the job. */
631 + V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);
637 +v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
641 mutex_lock(&v3d->reset_lock);
643 @@ -196,6 +247,41 @@ v3d_job_timedout(struct drm_sched_job *s
644 mutex_unlock(&v3d->reset_lock);
648 +v3d_job_timedout(struct drm_sched_job *sched_job)
650 + struct v3d_job *job = to_v3d_job(sched_job);
651 + struct v3d_exec_info *exec = job->exec;
652 + struct v3d_dev *v3d = exec->v3d;
653 + enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
654 + u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
655 + u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
657 + /* If the current address or return address have changed, then
658 + * the GPU has probably made progress and we should delay the
659 + * reset. This could fail if the GPU got in an infinite loop
660 + * in the CL, but that is pretty unlikely outside of an i-g-t
663 + if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
664 + job->timedout_ctca = ctca;
665 + job->timedout_ctra = ctra;
666 + schedule_delayed_work(&job->base.work_tdr,
667 + job->base.sched->timeout);
671 + v3d_gpu_reset_for_timeout(v3d, sched_job);
675 +v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
677 + struct v3d_tfu_job *job = to_tfu_job(sched_job);
679 + v3d_gpu_reset_for_timeout(job->v3d, sched_job);
682 static const struct drm_sched_backend_ops v3d_sched_ops = {
683 .dependency = v3d_job_dependency,
684 .run_job = v3d_job_run,
685 @@ -203,6 +289,13 @@ static const struct drm_sched_backend_op
686 .free_job = v3d_job_free
689 +static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
690 + .dependency = v3d_tfu_job_dependency,
691 + .run_job = v3d_tfu_job_run,
692 + .timedout_job = v3d_tfu_job_timedout,
693 + .free_job = v3d_tfu_job_free
697 v3d_sched_init(struct v3d_dev *v3d)
699 @@ -232,6 +325,19 @@ v3d_sched_init(struct v3d_dev *v3d)
700 drm_sched_fini(&v3d->queue[V3D_BIN].sched);
704 + ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
705 + &v3d_tfu_sched_ops,
706 + hw_jobs_limit, job_hang_limit,
707 + msecs_to_jiffies(hang_limit_ms),
710 + dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
712 + drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
713 + drm_sched_fini(&v3d->queue[V3D_BIN].sched);
719 --- a/drivers/gpu/drm/v3d/v3d_trace.h
720 +++ b/drivers/gpu/drm/v3d/v3d_trace.h
721 @@ -42,6 +42,26 @@ TRACE_EVENT(v3d_submit_cl,
725 +TRACE_EVENT(v3d_submit_tfu,
726 + TP_PROTO(struct drm_device *dev,
728 + TP_ARGS(dev, seqno),
732 + __field(u64, seqno)
736 + __entry->dev = dev->primary->index;
737 + __entry->seqno = seqno;
740 + TP_printk("dev=%u, seqno=%llu",
745 TRACE_EVENT(v3d_reset_begin,
746 TP_PROTO(struct drm_device *dev),
748 --- a/include/uapi/drm/v3d_drm.h
749 +++ b/include/uapi/drm/v3d_drm.h
750 @@ -36,6 +36,7 @@ extern "C" {
751 #define DRM_V3D_MMAP_BO 0x03
752 #define DRM_V3D_GET_PARAM 0x04
753 #define DRM_V3D_GET_BO_OFFSET 0x05
754 +#define DRM_V3D_SUBMIT_TFU 0x06
756 #define DRM_IOCTL_V3D_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
757 #define DRM_IOCTL_V3D_WAIT_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
758 @@ -43,6 +44,7 @@ extern "C" {
759 #define DRM_IOCTL_V3D_MMAP_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_MMAP_BO, struct drm_v3d_mmap_bo)
760 #define DRM_IOCTL_V3D_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
761 #define DRM_IOCTL_V3D_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
762 +#define DRM_IOCTL_V3D_SUBMIT_TFU DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
765 * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
766 @@ -169,6 +171,7 @@ enum drm_v3d_param {
767 DRM_V3D_PARAM_V3D_CORE0_IDENT0,
768 DRM_V3D_PARAM_V3D_CORE0_IDENT1,
769 DRM_V3D_PARAM_V3D_CORE0_IDENT2,
770 + DRM_V3D_PARAM_SUPPORTS_TFU,
773 struct drm_v3d_get_param {
774 @@ -187,6 +190,28 @@ struct drm_v3d_get_bo_offset {
778 +struct drm_v3d_submit_tfu {
787 + /* First handle is the output BO, following are other inputs.
790 + __u32 bo_handles[4];
791 + /* Sync object to block on before running the TFU job. Each TFU
792 + * job will execute in the order submitted to its FD. Synchronization
793 + * against rendering jobs requires using sync objects.
796 + /* Sync object to signal when the TFU job is done. */
800 #if defined(__cplusplus)