Message ID | 20250225-6-10-rocket-v2-6-d4dbcfafc141@tomeuvizoso.net (mailing list archive) |
---|---|
State | New |
Series | New DRM accel driver for Rockchip's RKNN NPU |
Hi

On 25.02.25 08:55, Tomeu Vizoso wrote:
> Using the DRM GPU scheduler infrastructure, with a scheduler for each
> core.
>
> Userspace can decide for a series of tasks to be executed sequentially
> in the same core, so SRAM locality can be taken advantage of.
>
> The job submission code was initially based on Panfrost.
>
> v2:
> - Remove hardcoded number of cores
> - Misc. style fixes (Jeffrey Hugo)
> - Repack IOCTL struct (Jeffrey Hugo)
>
> Signed-off-by: Tomeu Vizoso <tomeu@tomeuvizoso.net>
> ---
>  drivers/accel/rocket/Makefile        |   3 +-
>  drivers/accel/rocket/rocket_core.c   |   6 +
>  drivers/accel/rocket/rocket_core.h   |  14 +
>  drivers/accel/rocket/rocket_device.c |   2 +
>  drivers/accel/rocket/rocket_device.h |   2 +
>  drivers/accel/rocket/rocket_drv.c    |  15 +
>  drivers/accel/rocket/rocket_drv.h    |   4 +
>  drivers/accel/rocket/rocket_job.c    | 710 +++++++++++++++++++++++++++++++++++
>  drivers/accel/rocket/rocket_job.h    |  50 +++
>  include/uapi/drm/rocket_accel.h      |  55 +++
>  10 files changed, 860 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/accel/rocket/Makefile b/drivers/accel/rocket/Makefile
> index 875cac2243d902694e0d5d05e60b4ae551a633c4..4d59036af8d9c213d3cac0559eb66e3ebb0320e7 100644
> --- a/drivers/accel/rocket/Makefile
> +++ b/drivers/accel/rocket/Makefile
> @@ -6,4 +6,5 @@ rocket-y := \
>  	rocket_core.o \
>  	rocket_device.o \
>  	rocket_drv.o \
> -	rocket_gem.o
> +	rocket_gem.o \
> +	rocket_job.o
> diff --git a/drivers/accel/rocket/rocket_core.c b/drivers/accel/rocket/rocket_core.c
> index 09d966c826b5b1090a18cb24b3aa4aba286a12d4..2b522592693874eed90463e8f85653d5282ae5b8 100644
> --- a/drivers/accel/rocket/rocket_core.c
> +++ b/drivers/accel/rocket/rocket_core.c
> @@ -6,6 +6,7 @@
>  #include <linux/pm_runtime.h>
>
>  #include "rocket_core.h"
> +#include "rocket_job.h"
>  #include "rocket_registers.h"
>
>  static int rocket_clk_init(struct rocket_core *core)
> @@ -48,6 +49,10 @@ int rocket_core_init(struct rocket_core *core)
>  	if (IS_ERR(core->iomem))
>  		return PTR_ERR(core->iomem);
>
> +	err = rocket_job_init(core);
> +	if (err)
> +		return err;
> +
>  	pm_runtime_use_autosuspend(dev);
>  	pm_runtime_set_autosuspend_delay(dev, 50); /* ~3 frames */
>  	pm_runtime_enable(dev);
> @@ -68,4 +73,5 @@ int rocket_core_init(struct rocket_core *core)
>  void rocket_core_fini(struct rocket_core *core)
>  {
>  	pm_runtime_disable(core->dev);
> +	rocket_job_fini(core);
>  }
> diff --git a/drivers/accel/rocket/rocket_core.h b/drivers/accel/rocket/rocket_core.h
> index 2171eba7139ccc63fe24802dc81b4adb7f3abf31..045a46a2010a2ffd6122ed86c379e5fabc70365a 100644
> --- a/drivers/accel/rocket/rocket_core.h
> +++ b/drivers/accel/rocket/rocket_core.h
> @@ -21,6 +21,20 @@ struct rocket_core {
>  	void __iomem *iomem;
>  	struct clk *a_clk;
>  	struct clk *h_clk;
> +
> +	struct rocket_job *in_flight_job;
> +
> +	spinlock_t job_lock;
> +
> +	struct {
> +		struct workqueue_struct *wq;
> +		struct work_struct work;
> +		atomic_t pending;
> +	} reset;
> +
> +	struct drm_gpu_scheduler sched;
> +	u64 fence_context;
> +	u64 emit_seqno;
>  };
>
>  int rocket_core_init(struct rocket_core *core);
> diff --git a/drivers/accel/rocket/rocket_device.c b/drivers/accel/rocket/rocket_device.c
> index 9af36357caba7148dcac764c8222699f3b572d60..62c640e1e0200fe25b6834e45d71f6de139ff3ab 100644
> --- a/drivers/accel/rocket/rocket_device.c
> +++ b/drivers/accel/rocket/rocket_device.c
> @@ -12,6 +12,7 @@ int rocket_device_init(struct rocket_device *rdev)
>  	int err;
>
>  	mutex_init(&rdev->iommu_lock);
> +	mutex_init(&rdev->sched_lock);
>
>  	rdev->clk_npu = devm_clk_get(dev, "npu");
>  	rdev->pclk = devm_clk_get(dev, "pclk");
> @@ -29,5 +30,6 @@ int rocket_device_init(struct rocket_device *rdev)
>  void rocket_device_fini(struct rocket_device *rdev)
>  {
>  	rocket_core_fini(&rdev->cores[0]);
> +	mutex_destroy(&rdev->sched_lock);
>  	mutex_destroy(&rdev->iommu_lock);
>  }
> diff --git a/drivers/accel/rocket/rocket_device.h b/drivers/accel/rocket/rocket_device.h
> index c6152569fdd9e5587c8e8d7b0d7c2e2a77af6000..4168ae8da2d38c2ea114b37c6e053b02611a0232 100644
> --- a/drivers/accel/rocket/rocket_device.h
> +++ b/drivers/accel/rocket/rocket_device.h
> @@ -11,6 +11,8 @@
>  struct rocket_device {
>  	struct drm_device ddev;
>
> +	struct mutex sched_lock;
> +
>  	struct clk *clk_npu;
>  	struct clk *pclk;
>
> diff --git a/drivers/accel/rocket/rocket_drv.c b/drivers/accel/rocket/rocket_drv.c
> index e5612b52952fa7a0cd0af02aef314984bc483b05..a6b486e2d4f648d7b1d8831590b633bf661c7bc4 100644
> --- a/drivers/accel/rocket/rocket_drv.c
> +++ b/drivers/accel/rocket/rocket_drv.c
> @@ -16,12 +16,14 @@
>
>  #include "rocket_drv.h"
>  #include "rocket_gem.h"
> +#include "rocket_job.h"
>
>  static int
>  rocket_open(struct drm_device *dev, struct drm_file *file)
>  {
>  	struct rocket_device *rdev = to_rocket_device(dev);
>  	struct rocket_file_priv *rocket_priv;
> +	int ret;
>
>  	rocket_priv = kzalloc(sizeof(*rocket_priv), GFP_KERNEL);
>  	if (!rocket_priv)
> @@ -30,7 +32,15 @@ rocket_open(struct drm_device *dev, struct drm_file *file)
>  	rocket_priv->rdev = rdev;
>  	file->driver_priv = rocket_priv;
>
> +	ret = rocket_job_open(rocket_priv);
> +	if (ret)
> +		goto err_free;
> +
>  	return 0;
> +
> +err_free:
> +	kfree(rocket_priv);
> +	return ret;
>  }
>
>  static void
> @@ -38,6 +48,7 @@ rocket_postclose(struct drm_device *dev, struct drm_file *file)
>  {
>  	struct rocket_file_priv *rocket_priv = file->driver_priv;
>
> +	rocket_job_close(rocket_priv);
>  	kfree(rocket_priv);
>  }
>
> @@ -46,6 +57,7 @@ static const struct drm_ioctl_desc rocket_drm_driver_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(ROCKET_##n, rocket_ioctl_##func, 0)
>
>  	ROCKET_IOCTL(CREATE_BO, create_bo),
> +	ROCKET_IOCTL(SUBMIT, submit),
>  };
>
>  DEFINE_DRM_ACCEL_FOPS(rocket_accel_driver_fops);
> @@ -245,6 +257,9 @@ static int rocket_device_runtime_suspend(struct device *dev)
>  		if (dev != rdev->cores[core].dev)
>  			continue;
>
> +		if (!rocket_job_is_idle(&rdev->cores[core]))
> +			return -EBUSY;
> +
>  		clk_disable_unprepare(rdev->cores[core].a_clk);
>  		clk_disable_unprepare(rdev->cores[core].h_clk);
>
> diff --git a/drivers/accel/rocket/rocket_drv.h b/drivers/accel/rocket/rocket_drv.h
> index ccdd50c69d4c033eea18cb800407fdcfb3bf2e9b..54e21a61006057aee293496016e54b495a2f6d55 100644
> --- a/drivers/accel/rocket/rocket_drv.h
> +++ b/drivers/accel/rocket/rocket_drv.h
> @@ -4,10 +4,14 @@
>  #ifndef __ROCKET_DRV_H__
>  #define __ROCKET_DRV_H__
>
> +#include <drm/gpu_scheduler.h>
> +
>  #include "rocket_device.h"
>
>  struct rocket_file_priv {
>  	struct rocket_device *rdev;
> +
> +	struct drm_sched_entity sched_entity;
>  };
>
>  #endif
> diff --git a/drivers/accel/rocket/rocket_job.c b/drivers/accel/rocket/rocket_job.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..25b31f28e932aaee86173b9a0962932c9c640c03
> --- /dev/null
> +++ b/drivers/accel/rocket/rocket_job.c
> @@ -0,0 +1,710 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@kernel.org> */
> +/* Copyright 2019 Collabora ltd. */
> +/* Copyright 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net> */
> +
> +#include <drm/drm_file.h>
> +#include <drm/drm_gem.h>
> +#include <drm/rocket_accel.h>
> +#include <linux/interrupt.h>
> +#include <linux/platform_device.h>
> +#include <linux/pm_runtime.h>
> +
> +#include "rocket_core.h"
> +#include "rocket_device.h"
> +#include "rocket_drv.h"
> +#include "rocket_job.h"
> +#include "rocket_registers.h"
> +
> +#define JOB_TIMEOUT_MS 500
> +
> +#define job_write(dev, reg, data) writel(data, dev->iomem + (reg))
> +#define job_read(dev, reg) readl(dev->iomem + (reg))
> +
> +static struct rocket_job *
> +to_rocket_job(struct drm_sched_job *sched_job)
> +{
> +	return container_of(sched_job, struct rocket_job, base);
> +}
> +
> +struct rocket_fence {
> +	struct dma_fence base;
> +	struct drm_device *dev;
> +	/* rocket seqno for signaled() test */
> +	u64 seqno;
> +	int queue;
> +};
> +
> +static inline struct rocket_fence *
> +to_rocket_fence(struct dma_fence *fence)
> +{
> +	return (struct rocket_fence *)fence;

Proper upcast with container_of() please.

> +}
> +
> +static const char *rocket_fence_get_driver_name(struct dma_fence *fence)
> +{
> +	return "rocket";
> +}
> +
> +static const char *rocket_fence_get_timeline_name(struct dma_fence *fence)
> +{
> +	return "rockchip-npu";
> +}
> +
> +static const struct dma_fence_ops rocket_fence_ops = {
> +	.get_driver_name = rocket_fence_get_driver_name,
> +	.get_timeline_name = rocket_fence_get_timeline_name,
> +};
> +
> +static struct dma_fence *rocket_fence_create(struct rocket_core *core)
> +{
> +	struct rocket_device *rdev = core->rdev;
> +	struct rocket_fence *fence;
> +
> +	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
> +	if (!fence)
> +		return ERR_PTR(-ENOMEM);
> +
> +	fence->dev = &rdev->ddev;
> +	fence->seqno = ++core->emit_seqno;
> +	dma_fence_init(&fence->base, &rocket_fence_ops, &core->job_lock,
> +		       core->fence_context, fence->seqno);
> +
> +	return &fence->base;
> +}
> +
> +static int
> +rocket_copy_tasks(struct drm_device *dev,
> +		  struct drm_file *file_priv,
> +		  struct drm_rocket_job *job,
> +		  struct rocket_job *rjob)
> +{
> +	struct drm_rocket_task *tasks;
> +	int ret = 0;
> +	int i;
> +
> +	rjob->task_count = job->task_count;
> +
> +	if (!rjob->task_count)
> +		return 0;
> +
> +	tasks = kvmalloc_array(rjob->task_count, sizeof(*tasks), GFP_KERNEL);
> +	if (!tasks) {
> +		ret = -ENOMEM;
> +		DRM_DEBUG("Failed to allocate incoming tasks\n");

These logging macros are deprecated. Please use drm_dbg() and similar
functions instead. Here and everywhere else.

> +		goto fail;
> +	}
> +
> +	if (copy_from_user(tasks,
> +			   (void __user *)(uintptr_t)job->tasks,
> +			   rjob->task_count * sizeof(*tasks))) {
> +		ret = -EFAULT;
> +		DRM_DEBUG("Failed to copy incoming tasks\n");
> +		goto fail;
> +	}
> +
> +	rjob->tasks = kvmalloc_array(job->task_count, sizeof(*rjob->tasks), GFP_KERNEL);
> +	if (!rjob->tasks) {
> +		DRM_DEBUG("Failed to allocate task array\n");
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	for (i = 0; i < rjob->task_count; i++) {
> +		if (tasks[i].regcmd_count == 0) {
> +			ret = -EINVAL;
> +			goto fail;
> +		}
> +		rjob->tasks[i].regcmd = tasks[i].regcmd;
> +		rjob->tasks[i].regcmd_count = tasks[i].regcmd_count;
> +	}
> +
> +fail:
> +	kvfree(tasks);
> +	return ret;
> +}
> +
> +static void rocket_job_hw_submit(struct rocket_core *core, struct rocket_job *job)
> +{
> +	struct rocket_task *task;
> +	bool task_pp_en = 1;
> +	bool task_count = 1;
> +
> +	/* GO ! */
> +
> +	/* Don't queue the job if a reset is in progress */
> +	if (!atomic_read(&core->reset.pending)) {
> +
> +		task = &job->tasks[job->next_task_idx];
> +		job->next_task_idx++; /* TODO: Do this only after a successful run? */
> +
> +		rocket_write(core, REG_PC_BASE_ADDRESS, 0x1);

Just 'write' seems a little imprecise. Maybe 'writel' would be better.

Best regards
Thomas

> +
> +		rocket_write(core, REG_CNA_S_POINTER, 0xe + 0x10000000 * core->index);
> +		rocket_write(core, REG_CORE_S_POINTER, 0xe + 0x10000000 * core->index);
> +
> +		rocket_write(core, REG_PC_BASE_ADDRESS, task->regcmd);
> +		rocket_write(core, REG_PC_REGISTER_AMOUNTS, (task->regcmd_count + 1) / 2 - 1);
> +
> +		rocket_write(core, REG_PC_INTERRUPT_MASK,
> +			     PC_INTERRUPT_MASK_DPU_0 | PC_INTERRUPT_MASK_DPU_1);
> +		rocket_write(core, REG_PC_INTERRUPT_CLEAR,
> +			     PC_INTERRUPT_CLEAR_DPU_0 | PC_INTERRUPT_CLEAR_DPU_1);
> +
> +		rocket_write(core, REG_PC_TASK_CON, ((0x6 | task_pp_en) << 12) | task_count);
> +
> +		rocket_write(core, REG_PC_TASK_DMA_BASE_ADDR, 0x0);
> +
> +		rocket_write(core, REG_PC_OPERATION_ENABLE, 0x1);
> +
> +		dev_dbg(core->dev,
> +			"Submitted regcmd at 0x%llx to core %d",
> +			task->regcmd, core->index);
> +	}
> +}
> +
> +static int rocket_acquire_object_fences(struct drm_gem_object **bos,
> +					 int bo_count,
> +					 struct drm_sched_job *job,
> +					 bool is_write)
> +{
> +	int i, ret;
> +
> +	for (i = 0; i < bo_count; i++) {
> +		ret = dma_resv_reserve_fences(bos[i]->resv, 1);
> +		if (ret)
> +			return ret;
> +
> +		ret = drm_sched_job_add_implicit_dependencies(job, bos[i],
> +							      is_write);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static void rocket_attach_object_fences(struct drm_gem_object **bos,
> +					 int bo_count,
> +					 struct dma_fence *fence)
> +{
> +	int i;
> +
> +	for (i = 0; i < bo_count; i++)
> +		dma_resv_add_fence(bos[i]->resv, fence, DMA_RESV_USAGE_WRITE);
> +}
> +
> +static int rocket_job_push(struct rocket_job *job)
> +{
> +	struct rocket_device *rdev = job->rdev;
> +	struct drm_gem_object **bos;
> +	struct ww_acquire_ctx acquire_ctx;
> +	int ret = 0;
> +
> +	bos = kvmalloc_array(job->in_bo_count + job->out_bo_count, sizeof(void *),
> +			     GFP_KERNEL);
> +	memcpy(bos, job->in_bos, job->in_bo_count * sizeof(void *));
> +	memcpy(&bos[job->in_bo_count], job->out_bos, job->out_bo_count * sizeof(void *));
> +
> +	ret = drm_gem_lock_reservations(bos, job->in_bo_count + job->out_bo_count, &acquire_ctx);
> +	if (ret)
> +		goto err;
> +
> +	mutex_lock(&rdev->sched_lock);
> +	drm_sched_job_arm(&job->base);
> +
> +	job->inference_done_fence = dma_fence_get(&job->base.s_fence->finished);
> +
> +	ret = rocket_acquire_object_fences(job->in_bos, job->in_bo_count, &job->base, false);
> +	if (ret) {
> +		mutex_unlock(&rdev->sched_lock);
> +		goto err_unlock;
> +	}
> +
> +	ret = rocket_acquire_object_fences(job->out_bos, job->out_bo_count, &job->base, true);
> +	if (ret) {
> +		mutex_unlock(&rdev->sched_lock);
> +		goto err_unlock;
> +	}
> +
> +	kref_get(&job->refcount); /* put by scheduler job completion */
> +
> +	drm_sched_entity_push_job(&job->base);
> +
> +	mutex_unlock(&rdev->sched_lock);
> +
> +	rocket_attach_object_fences(job->out_bos, job->out_bo_count, job->inference_done_fence);
> +
> +err_unlock:
> +	drm_gem_unlock_reservations(bos, job->in_bo_count + job->out_bo_count, &acquire_ctx);
> +err:
> +	kfree(bos);
> +
> +	return ret;
> +}
> +
> +static void rocket_job_cleanup(struct kref *ref)
> +{
> +	struct rocket_job *job = container_of(ref, struct rocket_job,
> +					      refcount);
> +	unsigned int i;
> +
> +	dma_fence_put(job->done_fence);
> +	dma_fence_put(job->inference_done_fence);
> +
> +	if (job->in_bos) {
> +		for (i = 0; i < job->in_bo_count; i++)
> +			drm_gem_object_put(job->in_bos[i]);
> +
> +		kvfree(job->in_bos);
> +	}
> +
> +	if (job->out_bos) {
> +		for (i = 0; i < job->out_bo_count; i++)
> +			drm_gem_object_put(job->out_bos[i]);
> +
> +		kvfree(job->out_bos);
> +	}
> +
> +	kfree(job->tasks);
> +
> +	kfree(job);
> +}
> +
> +static void rocket_job_put(struct rocket_job *job)
> +{
> +	kref_put(&job->refcount, rocket_job_cleanup);
> +}
> +
> +static void rocket_job_free(struct drm_sched_job *sched_job)
> +{
> +	struct rocket_job *job = to_rocket_job(sched_job);
> +
> +	drm_sched_job_cleanup(sched_job);
> +
> +	rocket_job_put(job);
> +}
> +
> +static struct rocket_core *sched_to_core(struct rocket_device *rdev,
> +					 struct drm_gpu_scheduler *sched)
> +{
> +	unsigned int core;
> +
> +	for (core = 0; core < rdev->num_cores; core++) {
> +		if (&rdev->cores[core].sched == sched)
> +			return &rdev->cores[core];
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job)
> +{
> +	struct rocket_job *job = to_rocket_job(sched_job);
> +	struct rocket_device *rdev = job->rdev;
> +	struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
> +	struct dma_fence *fence = NULL;
> +	int ret;
> +
> +	if (unlikely(job->base.s_fence->finished.error))
> +		return NULL;
> +
> +	/*
> +	 * Nothing to execute: can happen if the job has finished while
> +	 * we were resetting the GPU.
> +	 */
> +	if (job->next_task_idx == job->task_count)
> +		return NULL;
> +
> +	fence = rocket_fence_create(core);
> +	if (IS_ERR(fence))
> +		return fence;
> +
> +	if (job->done_fence)
> +		dma_fence_put(job->done_fence);
> +	job->done_fence = dma_fence_get(fence);
> +
> +	ret = pm_runtime_get_sync(core->dev);
> +	if (ret < 0)
> +		return fence;
> +
> +	spin_lock(&core->job_lock);
> +
> +	core->in_flight_job = job;
> +	rocket_job_hw_submit(core, job);
> +
> +	spin_unlock(&core->job_lock);
> +
> +	return fence;
> +}
> +
> +static void rocket_job_handle_done(struct rocket_core *core,
> +				   struct rocket_job *job)
> +{
> +	if (job->next_task_idx < job->task_count) {
> +		rocket_job_hw_submit(core, job);
> +		return;
> +	}
> +
> +	core->in_flight_job = NULL;
> +	dma_fence_signal_locked(job->done_fence);
> +	pm_runtime_put_autosuspend(core->dev);
> +}
> +
> +static void rocket_job_handle_irq(struct rocket_core *core)
> +{
> +	uint32_t status, raw_status;
> +
> +	pm_runtime_mark_last_busy(core->dev);
> +
> +	status = rocket_read(core, REG_PC_INTERRUPT_STATUS);
> +	raw_status = rocket_read(core, REG_PC_INTERRUPT_RAW_STATUS);
> +
> +	rocket_write(core, REG_PC_OPERATION_ENABLE, 0x0);
> +	rocket_write(core, REG_PC_INTERRUPT_CLEAR, 0x1ffff);
> +
> +	spin_lock(&core->job_lock);
> +
> +	if (core->in_flight_job)
> +		rocket_job_handle_done(core, core->in_flight_job);
> +
> +	spin_unlock(&core->job_lock);
> +}
> +
> +static void
> +rocket_reset(struct rocket_core *core, struct drm_sched_job *bad)
> +{
> +	bool cookie;
> +
> +	if (!atomic_read(&core->reset.pending))
> +		return;
> +
> +	/*
> +	 * Stop the scheduler.
> +	 *
> +	 * FIXME: We temporarily get out of the dma_fence_signalling section
> +	 * because the cleanup path generate lockdep splats when taking locks
> +	 * to release job resources. We should rework the code to follow this
> +	 * pattern:
> +	 *
> +	 *	try_lock
> +	 *	if (locked)
> +	 *		release
> +	 *	else
> +	 *		schedule_work_to_release_later
> +	 */
> +	drm_sched_stop(&core->sched, bad);
> +
> +	cookie = dma_fence_begin_signalling();
> +
> +	if (bad)
> +		drm_sched_increase_karma(bad);
> +
> +	/*
> +	 * Mask job interrupts and synchronize to make sure we won't be
> +	 * interrupted during our reset.
> +	 */
> +	rocket_write(core, REG_PC_INTERRUPT_MASK, 0x0);
> +	synchronize_irq(core->irq);
> +
> +	/* Handle the remaining interrupts before we reset. */
> +	rocket_job_handle_irq(core);
> +
> +	/*
> +	 * Remaining interrupts have been handled, but we might still have
> +	 * stuck jobs. Let's make sure the PM counters stay balanced by
> +	 * manually calling pm_runtime_put_noidle() and
> +	 * rocket_devfreq_record_idle() for each stuck job.
> +	 * Let's also make sure the cycle counting register's refcnt is
> +	 * kept balanced to prevent it from running forever
> +	 */
> +	spin_lock(&core->job_lock);
> +	if (core->in_flight_job)
> +		pm_runtime_put_noidle(core->dev);
> +
> +	core->in_flight_job = NULL;
> +	spin_unlock(&core->job_lock);
> +
> +	/* Proceed with reset now. */
> +	pm_runtime_force_suspend(core->dev);
> +	pm_runtime_force_resume(core->dev);
> +
> +	/* GPU has been reset, we can clear the reset pending bit. */
> +	atomic_set(&core->reset.pending, 0);
> +
> +	/*
> +	 * Now resubmit jobs that were previously queued but didn't have a
> +	 * chance to finish.
> +	 * FIXME: We temporarily get out of the DMA fence signalling section
> +	 * while resubmitting jobs because the job submission logic will
> +	 * allocate memory with the GFP_KERNEL flag which can trigger memory
> +	 * reclaim and exposes a lock ordering issue.
> +	 */
> +	dma_fence_end_signalling(cookie);
> +	drm_sched_resubmit_jobs(&core->sched);
> +	cookie = dma_fence_begin_signalling();
> +
> +	/* Restart the scheduler */
> +	drm_sched_start(&core->sched, 0);
> +
> +	dma_fence_end_signalling(cookie);
> +}
> +
> +static enum drm_gpu_sched_stat rocket_job_timedout(struct drm_sched_job *sched_job)
> +{
> +	struct rocket_job *job = to_rocket_job(sched_job);
> +	struct rocket_device *rdev = job->rdev;
> +	struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
> +
> +	/*
> +	 * If the GPU managed to complete this jobs fence, the timeout is
> +	 * spurious. Bail out.
> +	 */
> +	if (dma_fence_is_signaled(job->done_fence))
> +		return DRM_GPU_SCHED_STAT_NOMINAL;
> +
> +	/*
> +	 * Rocket IRQ handler may take a long time to process an interrupt
> +	 * if there is another IRQ handler hogging the processing.
> +	 * For example, the HDMI encoder driver might be stuck in the IRQ
> +	 * handler for a significant time in a case of bad cable connection.
> +	 * In order to catch such cases and not report spurious rocket
> +	 * job timeouts, synchronize the IRQ handler and re-check the fence
> +	 * status.
> +	 */
> +	synchronize_irq(core->irq);
> +
> +	if (dma_fence_is_signaled(job->done_fence)) {
> +		dev_warn(core->dev, "unexpectedly high interrupt latency\n");
> +		return DRM_GPU_SCHED_STAT_NOMINAL;
> +	}
> +
> +	dev_err(core->dev, "gpu sched timeout");
> +
> +	atomic_set(&core->reset.pending, 1);
> +	rocket_reset(core, sched_job);
> +
> +	return DRM_GPU_SCHED_STAT_NOMINAL;
> +}
> +
> +static void rocket_reset_work(struct work_struct *work)
> +{
> +	struct rocket_core *core;
> +
> +	core = container_of(work, struct rocket_core, reset.work);
> +	rocket_reset(core, NULL);
> +}
> +
> +static const struct drm_sched_backend_ops rocket_sched_ops = {
> +	.run_job = rocket_job_run,
> +	.timedout_job = rocket_job_timedout,
> +	.free_job = rocket_job_free
> +};
> +
> +static irqreturn_t rocket_job_irq_handler_thread(int irq, void *data)
> +{
> +	struct rocket_core *core = data;
> +
> +	rocket_job_handle_irq(core);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static irqreturn_t rocket_job_irq_handler(int irq, void *data)
> +{
> +	struct rocket_core *core = data;
> +	uint32_t raw_status = rocket_read(core, REG_PC_INTERRUPT_RAW_STATUS);
> +
> +	WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_READ_ERROR);
> +	WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_READ_ERROR);
> +
> +	if (!(raw_status & PC_INTERRUPT_RAW_STATUS_DPU_0 ||
> +	      raw_status & PC_INTERRUPT_RAW_STATUS_DPU_1))
> +		return IRQ_NONE;
> +
> +	rocket_write(core, REG_PC_INTERRUPT_MASK, 0x0);
> +
> +	return IRQ_WAKE_THREAD;
> +}
> +
> +int rocket_job_init(struct rocket_core *core)
> +{
> +	int ret;
> +
> +	INIT_WORK(&core->reset.work, rocket_reset_work);
> +	spin_lock_init(&core->job_lock);
> +
> +	core->irq = platform_get_irq(to_platform_device(core->dev), 0);
> +	if (core->irq < 0)
> +		return core->irq;
> +
> +	ret = devm_request_threaded_irq(core->dev, core->irq,
> +					rocket_job_irq_handler,
> +					rocket_job_irq_handler_thread,
> +					IRQF_SHARED, KBUILD_MODNAME "-job",
> +					core);
> +	if (ret) {
> +		dev_err(core->dev, "failed to request job irq");
> +		return ret;
> +	}
> +
> +	core->reset.wq = alloc_ordered_workqueue("rocket-reset-%d", 0, core->index);
> +	if (!core->reset.wq)
> +		return -ENOMEM;
> +
> +	core->fence_context = dma_fence_context_alloc(1);
> +
> +	ret = drm_sched_init(&core->sched,
> +			     &rocket_sched_ops, NULL,
> +			     DRM_SCHED_PRIORITY_COUNT,
> +			     1, 0,
> +			     msecs_to_jiffies(JOB_TIMEOUT_MS),
> +			     core->reset.wq,
> +			     NULL, "rocket", core->dev);
> +	if (ret) {
> +		dev_err(core->dev, "Failed to create scheduler: %d.", ret);
> +		goto err_sched;
> +	}
> +
> +	return 0;
> +
> +err_sched:
> +	drm_sched_fini(&core->sched);
> +
> +	destroy_workqueue(core->reset.wq);
> +	return ret;
> +}
> +
> +void rocket_job_fini(struct rocket_core *core)
> +{
> +	drm_sched_fini(&core->sched);
> +
> +	cancel_work_sync(&core->reset.work);
> +	destroy_workqueue(core->reset.wq);
> +}
> +
> +int rocket_job_open(struct rocket_file_priv *rocket_priv)
> +{
> +	struct rocket_device *rdev = rocket_priv->rdev;
> +	struct drm_gpu_scheduler **scheds = kmalloc_array(rdev->num_cores, sizeof(scheds),
> +							  GFP_KERNEL);
> +	unsigned int core;
> +	int ret;
> +
> +	for (core = 0; core < rdev->num_cores; core++)
> +		scheds[core] = &rdev->cores[core].sched;
> +
> +	ret = drm_sched_entity_init(&rocket_priv->sched_entity,
> +				    DRM_SCHED_PRIORITY_NORMAL,
> +				    scheds,
> +				    rdev->num_cores, NULL);
> +	if (WARN_ON(ret))
> +		return ret;
> +
> +	return 0;
> +}
> +
> +void rocket_job_close(struct rocket_file_priv *rocket_priv)
> +{
> +	struct drm_sched_entity *entity = &rocket_priv->sched_entity;
> +
> +	kfree(entity->sched_list);
> +	drm_sched_entity_destroy(entity);
> +}
> +
> +int rocket_job_is_idle(struct rocket_core *core)
> +{
> +	/* If there are any jobs in this HW queue, we're not idle */
> +	if (atomic_read(&core->sched.credit_count))
> +		return false;
> +
> +	return true;
> +}
> +
> +static int rocket_ioctl_submit_job(struct drm_device *dev, struct drm_file *file,
> +				   struct drm_rocket_job *job)
> +{
> +	struct rocket_device *rdev = to_rocket_device(dev);
> +	struct rocket_file_priv *file_priv = file->driver_priv;
> +	struct rocket_job *rjob = NULL;
> +	int ret = 0;
> +
> +	if (job->task_count == 0)
> +		return -EINVAL;
> +
> +	rjob = kzalloc(sizeof(*rjob), GFP_KERNEL);
> +	if (!rjob)
> +		return -ENOMEM;
> +
> +	kref_init(&rjob->refcount);
> +
> +	rjob->rdev = rdev;
> +
> +	ret = drm_sched_job_init(&rjob->base,
> +				 &file_priv->sched_entity,
> +				 1, NULL);
> +	if (ret)
> +		goto out_put_job;
> +
> +	ret = rocket_copy_tasks(dev, file, job, rjob);
> +	if (ret)
> +		goto out_cleanup_job;
> +
> +	ret = drm_gem_objects_lookup(file,
> +				     (void __user *)(uintptr_t)job->in_bo_handles,
> +				     job->in_bo_handle_count, &rjob->in_bos);
> +	if (ret)
> +		goto out_cleanup_job;
> +
> +	rjob->in_bo_count = job->in_bo_handle_count;
> +
> +	ret = drm_gem_objects_lookup(file,
> +				     (void __user *)(uintptr_t)job->out_bo_handles,
> +				     job->out_bo_handle_count, &rjob->out_bos);
> +	if (ret)
> +		goto out_cleanup_job;
> +
> +	rjob->out_bo_count = job->out_bo_handle_count;
> +
> +	ret = rocket_job_push(rjob);
> +	if (ret)
> +		goto out_cleanup_job;
> +
> +out_cleanup_job:
> +	if (ret)
> +		drm_sched_job_cleanup(&rjob->base);
> +out_put_job:
> +	rocket_job_put(rjob);
> +
> +	return ret;
> +}
> +
> +int rocket_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file)
> +{
> +	struct drm_rocket_submit *args = data;
> +	struct drm_rocket_job *jobs;
> +	int ret = 0;
> +	unsigned int i = 0;
> +
> +	jobs = kvmalloc_array(args->job_count, sizeof(*jobs), GFP_KERNEL);
> +	if (!jobs) {
> +		DRM_DEBUG("Failed to allocate incoming job array\n");
> +		return -ENOMEM;
> +	}
> +
> +	if (copy_from_user(jobs,
> +			   (void __user *)(uintptr_t)args->jobs,
> +			   args->job_count * sizeof(*jobs))) {
> +		ret = -EFAULT;
> +		DRM_DEBUG("Failed to copy incoming job array\n");
> +		goto exit;
> +	}
> +
> +	for (i = 0; i < args->job_count; i++)
> +		rocket_ioctl_submit_job(dev, file, &jobs[i]);
> +
> +exit:
> +	kfree(jobs);
> +
> +	return ret;
> +}
> diff --git a/drivers/accel/rocket/rocket_job.h b/drivers/accel/rocket/rocket_job.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..93fa1f988c72adb7a405acbf08c1c9b87d22f9c5
> --- /dev/null
> +++ b/drivers/accel/rocket/rocket_job.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/* Copyright 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net> */
> +
> +#ifndef __ROCKET_JOB_H__
> +#define __ROCKET_JOB_H__
> +
> +#include <drm/drm_drv.h>
> +#include <drm/gpu_scheduler.h>
> +
> +#include "rocket_core.h"
> +#include "rocket_drv.h"
> +
> +struct rocket_task {
> +	u64 regcmd;
> +	u32 regcmd_count;
> +};
> +
> +struct rocket_job {
> +	struct drm_sched_job base;
> +
> +	struct rocket_device *rdev;
> +
> +	struct drm_gem_object **in_bos;
> +	struct drm_gem_object **out_bos;
> +
> +	u32 in_bo_count;
> +	u32 out_bo_count;
> +
> +	struct rocket_task *tasks;
> +	u32 task_count;
> +	u32 next_task_idx;
> +
> +	/* Fence to be signaled by drm-sched once its done with the job */
> +	struct dma_fence *inference_done_fence;
> +
> +	/* Fence to be signaled by IRQ handler when the job is complete. */
> +	struct dma_fence *done_fence;
> +
> +	struct kref refcount;
> +};
> +
> +int rocket_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file);
> +
> +int rocket_job_init(struct rocket_core *core);
> +void rocket_job_fini(struct rocket_core *core);
> +int rocket_job_open(struct rocket_file_priv *rocket_priv);
> +void rocket_job_close(struct rocket_file_priv *rocket_priv);
> +int rocket_job_is_idle(struct rocket_core *core);
> +
> +#endif
> diff --git a/include/uapi/drm/rocket_accel.h b/include/uapi/drm/rocket_accel.h
> index 8338726a83c31b954608ca505cf78bcd70d3494b..eb886351134ebef62969b1e1182ccc174f88fe9d 100644
> --- a/include/uapi/drm/rocket_accel.h
> +++ b/include/uapi/drm/rocket_accel.h
> @@ -12,8 +12,10 @@ extern "C" {
>  #endif
>
>  #define DRM_ROCKET_CREATE_BO 0x00
> +#define DRM_ROCKET_SUBMIT 0x01
>
>  #define DRM_IOCTL_ROCKET_CREATE_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_ROCKET_CREATE_BO, struct drm_rocket_create_bo)
> +#define DRM_IOCTL_ROCKET_SUBMIT DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_SUBMIT, struct drm_rocket_submit)
>
>  /**
>   * struct drm_rocket_create_bo - ioctl argument for creating Rocket BOs.
> @@ -36,6 +38,59 @@ struct drm_rocket_create_bo {
>  	__u64 offset;
>  };
>
> +/**
> + * struct drm_rocket_task - A task to be run on the NPU
> + *
> + * A task is the smallest unit of work that can be run on the NPU.
> + */
> +struct drm_rocket_task {
> +	/** DMA address to NPU mapping of register command buffer */
> +	__u64 regcmd;
> +
> +	/** Number of commands in the register command buffer */
> +	__u32 regcmd_count;
> +};
> +
> +/**
> + * struct drm_rocket_job - A job to be run on the NPU
> + *
> + * The kernel will schedule the execution of this job taking into account its
> + * dependencies with other jobs. All tasks in the same job will be executed
> + * sequentially on the same core, to benefit from memory residency in SRAM.
> + */
> +struct drm_rocket_job {
> +	/** Pointer to an array of struct drm_rocket_task. */
> +	__u64 tasks;
> +
> +	/** Pointer to a u32 array of the BOs that are read by the job. */
> +	__u64 in_bo_handles;
> +
> +	/** Pointer to a u32 array of the BOs that are written to by the job. */
> +	__u64 out_bo_handles;
> +
> +	/** Number of tasks passed in. */
> +	__u32 task_count;
> +
> +	/** Number of input BO handles passed in (size is that times 4). */
> +	__u32 in_bo_handle_count;
> +
> +	/** Number of output BO handles passed in (size is that times 4). */
> +	__u32 out_bo_handle_count;
> +};
> +
> +/**
> + * struct drm_rocket_submit - ioctl argument for submitting commands to the NPU.
> + *
> + * The kernel will schedule the execution of these jobs in dependency order.
> + */
> +struct drm_rocket_submit {
> +	/** Pointer to an array of struct drm_rocket_job. */
> +	__u64 jobs;
> +
> +	/** Number of jobs passed in. */
> +	__u32 job_count;
> +};
> +
>  #if defined(__cplusplus)
>  }
>  #endif
>
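For illustration, the container_of() upcast requested in the review could look like the following sketch (same names as in the quoted patch; container_of() stays correct even if the base member ever moves within the struct):

static inline struct rocket_fence *
to_rocket_fence(struct dma_fence *fence)
{
	return container_of(fence, struct rocket_fence, base);
}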
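The drm_dbg() conversion asked for in the review would look roughly like this in rocket_copy_tasks(), which already receives the struct drm_device pointer as its dev argument (a sketch, not the author's final code):

	tasks = kvmalloc_array(rjob->task_count, sizeof(*tasks), GFP_KERNEL);
	if (!tasks) {
		ret = -ENOMEM;
		drm_dbg(dev, "Failed to allocate incoming tasks\n");
		goto fail;
	}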
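The naming suggestion for the MMIO helpers could be addressed with thin wrappers along these lines (rocket_writel()/rocket_readl() are hypothetical names following the review comment; the patch currently uses rocket_write()/rocket_read(), and defines analogous job_write()/job_read() macros in rocket_job.c):

#define rocket_writel(core, reg, value) writel((value), (core)->iomem + (reg))
#define rocket_readl(core, reg) readl((core)->iomem + (reg))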
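For readers unfamiliar with the new uAPI, here is a minimal userspace sketch of the SUBMIT ioctl based only on the structures declared in rocket_accel.h (rocket_submit_one() is a hypothetical helper; the BO handles and the regcmd DMA address are assumed to come from earlier CREATE_BO setup, and error handling is omitted):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/rocket_accel.h>

static int rocket_submit_one(int fd, uint64_t regcmd, uint32_t regcmd_count,
			     uint32_t *in_handles, uint32_t in_count,
			     uint32_t *out_handles, uint32_t out_count)
{
	/* One task: a single register command buffer for the NPU to run. */
	struct drm_rocket_task task = {
		.regcmd = regcmd,
		.regcmd_count = regcmd_count,
	};
	/* One job: all of its tasks run sequentially on the same core. */
	struct drm_rocket_job job = {
		.tasks = (uintptr_t)&task,
		.task_count = 1,
		.in_bo_handles = (uintptr_t)in_handles,
		.in_bo_handle_count = in_count,
		.out_bo_handles = (uintptr_t)out_handles,
		.out_bo_handle_count = out_count,
	};
	struct drm_rocket_submit submit = {
		.jobs = (uintptr_t)&job,
		.job_count = 1,
	};

	return ioctl(fd, DRM_IOCTL_ROCKET_SUBMIT, &submit);
}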