| Message ID | 20140709122953.11354.46381.stgit@patser (mailing list archive) |
| --- | --- |
| State | New, archived |
> -----Original Message----- > From: Maarten Lankhorst [mailto:maarten.lankhorst@canonical.com] > Sent: Wednesday, July 09, 2014 8:30 AM > To: airlied@linux.ie > Cc: thellstrom@vmware.com; nouveau@lists.freedesktop.org; linux- > kernel@vger.kernel.org; dri-devel@lists.freedesktop.org; > bskeggs@redhat.com; Deucher, Alexander; Koenig, Christian > Subject: [PATCH 09/17] drm/radeon: use common fence implementation for > fences > > Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> > --- > drivers/gpu/drm/radeon/radeon.h | 15 +- > drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- > drivers/gpu/drm/radeon/radeon_fence.c | 223 > ++++++++++++++++++++++++++------ > 3 files changed, 248 insertions(+), 50 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/radeon.h > b/drivers/gpu/drm/radeon/radeon.h > index 29d9cc04c04e..03a5567f2c2f 100644 > --- a/drivers/gpu/drm/radeon/radeon.h > +++ b/drivers/gpu/drm/radeon/radeon.h > @@ -64,6 +64,7 @@ > #include <linux/wait.h> > #include <linux/list.h> > #include <linux/kref.h> > +#include <linux/fence.h> > > #include <ttm/ttm_bo_api.h> > #include <ttm/ttm_bo_driver.h> > @@ -116,9 +117,6 @@ extern int radeon_deep_color; > #define RADEONFB_CONN_LIMIT 4 > #define RADEON_BIOS_NUM_SCRATCH 8 > > -/* fence seq are set to this number when signaled */ > -#define RADEON_FENCE_SIGNALED_SEQ 0LL > - > /* internal ring indices */ > /* r1xx+ has gfx CP ring */ > #define RADEON_RING_TYPE_GFX_INDEX 0 > @@ -350,12 +348,15 @@ struct radeon_fence_driver { > }; > > struct radeon_fence { > + struct fence base; > + > struct radeon_device *rdev; > - struct kref kref; > /* protected by radeon_fence.lock */ > uint64_t seq; > /* RB, DMA, etc. */ > unsigned ring; > + > + wait_queue_t fence_wake; > }; > > int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring); > @@ -2268,6 +2269,7 @@ struct radeon_device { > struct radeon_mman mman; > struct radeon_fence_driver fence_drv[RADEON_NUM_RINGS]; > wait_queue_head_t fence_queue; > + unsigned fence_context; > struct mutex ring_lock; > struct radeon_ring ring[RADEON_NUM_RINGS]; > bool ib_pool_ready; > @@ -2358,11 +2360,6 @@ u32 cik_mm_rdoorbell(struct radeon_device > *rdev, u32 index); > void cik_mm_wdoorbell(struct radeon_device *rdev, u32 index, u32 v); > > /* > - * Cast helper > - */ > -#define to_radeon_fence(p) ((struct radeon_fence *)(p)) > - > -/* > * Registers read & write functions. > */ > #define RREG8(reg) readb((rdev->rmmio) + (reg)) > diff --git a/drivers/gpu/drm/radeon/radeon_device.c > b/drivers/gpu/drm/radeon/radeon_device.c > index 03686fab842d..86699df7c8f3 100644 > --- a/drivers/gpu/drm/radeon/radeon_device.c > +++ b/drivers/gpu/drm/radeon/radeon_device.c > @@ -1213,6 +1213,7 @@ int radeon_device_init(struct radeon_device > *rdev, > for (i = 0; i < RADEON_NUM_RINGS; i++) { > rdev->ring[i].idx = i; > } > + rdev->fence_context = > fence_context_alloc(RADEON_NUM_RINGS); > > DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X > 0x%04X:0x%04X).\n", > radeon_family_name[rdev->family], pdev->vendor, pdev- > >device, > @@ -1607,6 +1608,54 @@ int radeon_resume_kms(struct drm_device *dev, > bool resume, bool fbcon) > return 0; > } > > +static uint32_t radeon_gpu_mask_sw_irq(struct radeon_device *rdev) > +{ > + uint32_t mask = 0; > + int i; > + > + if (!rdev->ddev->irq_enabled) > + return mask; > + > + /* > + * increase refcount on sw interrupts for all rings to stop > + * enabling interrupts in radeon_fence_enable_signaling during > + * gpu reset. 
> + */ > + > + for (i = 0; i < RADEON_NUM_RINGS; ++i) { > + if (!rdev->ring[i].ready) > + continue; > + > + atomic_inc(&rdev->irq.ring_int[i]); > + mask |= 1 << i; > + } > + return mask; > +} > + > +static void radeon_gpu_unmask_sw_irq(struct radeon_device *rdev, > uint32_t mask) > +{ > + unsigned long irqflags; > + int i; > + > + if (!mask) > + return; > + > + /* > + * undo refcount increase, and reset irqs to correct value. > + */ > + > + for (i = 0; i < RADEON_NUM_RINGS; ++i) { > + if (!(mask & (1 << i))) > + continue; > + > + atomic_dec(&rdev->irq.ring_int[i]); > + } > + > + spin_lock_irqsave(&rdev->irq.lock, irqflags); > + radeon_irq_set(rdev); > + spin_unlock_irqrestore(&rdev->irq.lock, irqflags); > +} > + > /** > * radeon_gpu_reset - reset the asic > * > @@ -1624,6 +1673,7 @@ int radeon_gpu_reset(struct radeon_device *rdev) > > int i, r; > int resched; > + uint32_t sw_mask; > > down_write(&rdev->exclusive_lock); > > @@ -1637,6 +1687,7 @@ int radeon_gpu_reset(struct radeon_device *rdev) > radeon_save_bios_scratch_regs(rdev); > /* block TTM */ > resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev); > + sw_mask = radeon_gpu_mask_sw_irq(rdev); > radeon_pm_suspend(rdev); > radeon_suspend(rdev); > > @@ -1686,13 +1737,20 @@ retry: > radeon_pm_resume(rdev); > drm_helper_resume_force_mode(rdev->ddev); > > + radeon_gpu_unmask_sw_irq(rdev, sw_mask); > ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, > resched); > if (r) { > /* bad news, how to tell it to userspace ? */ > dev_info(rdev->dev, "GPU reset failed\n"); > } > > - up_write(&rdev->exclusive_lock); > + /* > + * force all waiters to recheck, some may have been > + * added while the exclusive_lock was unavailable > + */ > + downgrade_write(&rdev->exclusive_lock); > + wake_up_all(&rdev->fence_queue); > + up_read(&rdev->exclusive_lock); > return r; > } > > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c > b/drivers/gpu/drm/radeon/radeon_fence.c > index 6435719fd45b..81c98f6ff0ca 100644 > --- a/drivers/gpu/drm/radeon/radeon_fence.c > +++ b/drivers/gpu/drm/radeon/radeon_fence.c > @@ -39,6 +39,15 @@ > #include "radeon.h" > #include "radeon_trace.h" > > +static const struct fence_ops radeon_fence_ops; > + > +#define to_radeon_fence(p) \ > + ({ \ > + struct radeon_fence *__f; \ > + __f = container_of((p), struct radeon_fence, base); \ > + __f->base.ops == &radeon_fence_ops ? 
__f : NULL; \ > + }) > + > /* > * Fences > * Fences mark an event in the GPUs pipeline and are used > @@ -111,30 +120,55 @@ int radeon_fence_emit(struct radeon_device > *rdev, > struct radeon_fence **fence, > int ring) > { > + u64 seq = ++rdev->fence_drv[ring].sync_seq[ring]; > + > /* we are protected by the ring emission mutex */ > *fence = kmalloc(sizeof(struct radeon_fence), GFP_KERNEL); > if ((*fence) == NULL) { > return -ENOMEM; > } > - kref_init(&((*fence)->kref)); > - (*fence)->rdev = rdev; > - (*fence)->seq = ++rdev->fence_drv[ring].sync_seq[ring]; > (*fence)->ring = ring; > + fence_init(&(*fence)->base, &radeon_fence_ops, > + &rdev->fence_queue.lock, rdev->fence_context + ring, > seq); > + (*fence)->rdev = rdev; > + (*fence)->seq = seq; > radeon_fence_ring_emit(rdev, ring, *fence); > trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq); > return 0; > } > > /** > - * radeon_fence_process - process a fence > + * radeon_fence_check_signaled - callback from fence_queue > * > - * @rdev: radeon_device pointer > - * @ring: ring index the fence is associated with > - * > - * Checks the current fence value and wakes the fence queue > - * if the sequence number has increased (all asics). > + * this function is called with fence_queue lock held, which is also used > + * for the fence locking itself, so unlocked variants are used for > + * fence_signal, and remove_wait_queue. > */ > -void radeon_fence_process(struct radeon_device *rdev, int ring) > +static int radeon_fence_check_signaled(wait_queue_t *wait, unsigned > mode, int flags, void *key) > +{ > + struct radeon_fence *fence; > + u64 seq; > + > + fence = container_of(wait, struct radeon_fence, fence_wake); > + > + seq = atomic64_read(&fence->rdev->fence_drv[fence- > >ring].last_seq); > + if (seq >= fence->seq) { > + int ret = fence_signal_locked(&fence->base); > + > + if (!ret) > + FENCE_TRACE(&fence->base, "signaled from irq > context\n"); > + else > + FENCE_TRACE(&fence->base, "was already > signaled\n"); > + > + radeon_irq_kms_sw_irq_put(fence->rdev, fence->ring); > + __remove_wait_queue(&fence->rdev->fence_queue, > &fence->fence_wake); > + fence_put(&fence->base); > + } else > + FENCE_TRACE(&fence->base, "pending\n"); > + return 0; > +} > + > +static bool __radeon_fence_process(struct radeon_device *rdev, int ring) > { > uint64_t seq, last_seq, last_emitted; > unsigned count_loop = 0; > @@ -190,23 +224,22 @@ void radeon_fence_process(struct radeon_device > *rdev, int ring) > } > } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > > seq); > > - if (wake) > - wake_up_all(&rdev->fence_queue); > + return wake; > } > > /** > - * radeon_fence_destroy - destroy a fence > + * radeon_fence_process - process a fence > * > - * @kref: fence kref > + * @rdev: radeon_device pointer > + * @ring: ring index the fence is associated with > * > - * Frees the fence object (all asics). > + * Checks the current fence value and wakes the fence queue > + * if the sequence number has increased (all asics). 
> */ > -static void radeon_fence_destroy(struct kref *kref) > +void radeon_fence_process(struct radeon_device *rdev, int ring) > { > - struct radeon_fence *fence; > - > - fence = container_of(kref, struct radeon_fence, kref); > - kfree(fence); > + if (__radeon_fence_process(rdev, ring)) > + wake_up_all(&rdev->fence_queue); > } > > /** > @@ -237,6 +270,69 @@ static bool radeon_fence_seq_signaled(struct > radeon_device *rdev, > return false; > } > > +static bool __radeon_fence_signaled(struct fence *f) > +{ > + struct radeon_fence *fence = to_radeon_fence(f); > + struct radeon_device *rdev = fence->rdev; > + unsigned ring = fence->ring; > + u64 seq = fence->seq; > + > + if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq) { > + return true; > + } > + > + if (down_read_trylock(&rdev->exclusive_lock)) { > + radeon_fence_process(rdev, ring); > + up_read(&rdev->exclusive_lock); > + > + if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq) > { > + return true; > + } > + } > + return false; > +} > + > +/** > + * radeon_fence_enable_signaling - enable signalling on fence > + * @fence: fence > + * > + * This function is called with fence_queue lock held, and adds a callback > + * to fence_queue that checks if this fence is signaled, and if so it > + * signals the fence and removes itself. > + */ > +static bool radeon_fence_enable_signaling(struct fence *f) > +{ > + struct radeon_fence *fence = to_radeon_fence(f); > + struct radeon_device *rdev = fence->rdev; > + > + if (atomic64_read(&rdev->fence_drv[fence->ring].last_seq) >= > fence->seq || > + !rdev->ddev->irq_enabled) > + return false; > + > + radeon_irq_kms_sw_irq_get(rdev, fence->ring); > + > + if (down_read_trylock(&rdev->exclusive_lock)) { > + if (__radeon_fence_process(rdev, fence->ring)) > + wake_up_all_locked(&rdev->fence_queue); > + > + up_read(&rdev->exclusive_lock); > + } > + > + /* did fence get signaled after we enabled the sw irq? 
*/ > + if (atomic64_read(&rdev->fence_drv[fence->ring].last_seq) >= > fence->seq) { > + radeon_irq_kms_sw_irq_put(rdev, fence->ring); > + return false; > + } > + > + fence->fence_wake.flags = 0; > + fence->fence_wake.private = NULL; > + fence->fence_wake.func = radeon_fence_check_signaled; > + __add_wait_queue(&rdev->fence_queue, &fence->fence_wake); > + fence_get(f); > + > + return true; > +} > + > /** > * radeon_fence_signaled - check if a fence has signaled > * > @@ -250,11 +346,13 @@ bool radeon_fence_signaled(struct radeon_fence > *fence) > if (!fence) { > return true; > } > - if (fence->seq == RADEON_FENCE_SIGNALED_SEQ) { > - return true; > - } > + > if (radeon_fence_seq_signaled(fence->rdev, fence->seq, fence- > >ring)) { > - fence->seq = RADEON_FENCE_SIGNALED_SEQ; > + int ret; > + > + ret = fence_signal(&fence->base); > + if (!ret) > + FENCE_TRACE(&fence->base, "signaled from > radeon_fence_signaled\n"); > return true; > } > return false; > @@ -413,21 +511,18 @@ int radeon_fence_wait(struct radeon_fence > *fence, bool intr) > uint64_t seq[RADEON_NUM_RINGS] = {}; > long r; > > - if (fence == NULL) { > - WARN(1, "Querying an invalid fence : %p !\n", fence); > - return -EINVAL; > - } > - > - seq[fence->ring] = fence->seq; > - if (seq[fence->ring] == RADEON_FENCE_SIGNALED_SEQ) > + if (test_bit(FENCE_FLAG_SIGNALED_BIT, &fence->base.flags)) > return 0; > > + seq[fence->ring] = fence->seq; > r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, > MAX_SCHEDULE_TIMEOUT); > if (r < 0) { > return r; > } > > - fence->seq = RADEON_FENCE_SIGNALED_SEQ; > + r = fence_signal(&fence->base); > + if (!r) > + FENCE_TRACE(&fence->base, "signaled from > fence_wait\n"); > return 0; > } > > @@ -459,12 +554,13 @@ int radeon_fence_wait_any(struct radeon_device > *rdev, > continue; > } > > + if (test_bit(FENCE_FLAG_SIGNALED_BIT, &fences[i]- > >base.flags)) { > + /* already signaled */ > + return 0; > + } > + > seq[i] = fences[i]->seq; > ++num_rings; > - > - /* test if something was allready signaled */ > - if (seq[i] == RADEON_FENCE_SIGNALED_SEQ) > - return 0; > } > > /* nothing to wait for ? 
*/ > @@ -545,7 +641,7 @@ int radeon_fence_wait_empty(struct radeon_device > *rdev, int ring) > */ > struct radeon_fence *radeon_fence_ref(struct radeon_fence *fence) > { > - kref_get(&fence->kref); > + fence_get(&fence->base); > return fence; > } > > @@ -561,9 +657,8 @@ void radeon_fence_unref(struct radeon_fence > **fence) > struct radeon_fence *tmp = *fence; > > *fence = NULL; > - if (tmp) { > - kref_put(&tmp->kref, radeon_fence_destroy); > - } > + if (tmp) > + fence_put(&tmp->base); > } > > /** > @@ -872,3 +967,51 @@ int radeon_debugfs_fence_init(struct > radeon_device *rdev) > return 0; > #endif > } > + > +static long __radeon_fence_wait(struct fence *f, bool intr, long timeout) > +{ > + struct radeon_fence *fence = to_radeon_fence(f); > + u64 target_seq[RADEON_NUM_RINGS] = {}; > + struct radeon_device *rdev = fence->rdev; > + long r; > + > + target_seq[fence->ring] = fence->seq; > + > + down_read(&rdev->exclusive_lock); > + r = radeon_fence_wait_seq_timeout(fence->rdev, target_seq, intr, > timeout); > + > + if (r > 0 && !fence_signal(&fence->base)) > + FENCE_TRACE(&fence->base, "signaled from > __radeon_fence_wait\n"); > + > + up_read(&rdev->exclusive_lock); > + return r; > + > +} > + > +static const char *radeon_fence_get_driver_name(struct fence *fence) > +{ > + return "radeon"; > +} > + > +static const char *radeon_fence_get_timeline_name(struct fence *f) > +{ > + struct radeon_fence *fence = to_radeon_fence(f); > + switch (fence->ring) { > + case RADEON_RING_TYPE_GFX_INDEX: return "radeon.gfx"; > + case CAYMAN_RING_TYPE_CP1_INDEX: return "radeon.cp1"; > + case CAYMAN_RING_TYPE_CP2_INDEX: return "radeon.cp2"; > + case R600_RING_TYPE_DMA_INDEX: return "radeon.dma"; > + case CAYMAN_RING_TYPE_DMA1_INDEX: return "radeon.dma1"; > + case R600_RING_TYPE_UVD_INDEX: return "radeon.uvd"; Radeon supports vce rings on newer ascis. Probably want to add the case for those here too. Alex > + default: WARN_ON_ONCE(1); return "radeon.unk"; > + } > +} > + > +static const struct fence_ops radeon_fence_ops = { > + .get_driver_name = radeon_fence_get_driver_name, > + .get_timeline_name = radeon_fence_get_timeline_name, > + .enable_signaling = radeon_fence_enable_signaling, > + .signaled = __radeon_fence_signaled, > + .wait = __radeon_fence_wait, > + .release = NULL, > +};
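Alex's note about the VCE rings would presumably just extend the switch in the new timeline-name callback. A minimal sketch of that addition follows; the TN_RING_TYPE_VCE1_INDEX/TN_RING_TYPE_VCE2_INDEX macro names are assumed from the radeon headers of that era and are not part of this patch.

```c
/* Sketch only: the patch's timeline-name helper extended with the VCE
 * rings Alex mentions. The two VCE index macros are assumed names and
 * should be checked against the tree this applies to. */
static const char *radeon_fence_get_timeline_name(struct fence *f)
{
	struct radeon_fence *fence = to_radeon_fence(f);

	switch (fence->ring) {
	case RADEON_RING_TYPE_GFX_INDEX: return "radeon.gfx";
	case CAYMAN_RING_TYPE_CP1_INDEX: return "radeon.cp1";
	case CAYMAN_RING_TYPE_CP2_INDEX: return "radeon.cp2";
	case R600_RING_TYPE_DMA_INDEX: return "radeon.dma";
	case CAYMAN_RING_TYPE_DMA1_INDEX: return "radeon.dma1";
	case R600_RING_TYPE_UVD_INDEX: return "radeon.uvd";
	case TN_RING_TYPE_VCE1_INDEX: return "radeon.vce1";	/* assumed macro name */
	case TN_RING_TYPE_VCE2_INDEX: return "radeon.vce2";	/* assumed macro name */
	default: WARN_ON_ONCE(1); return "radeon.unk";
	}
}
```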
On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: > Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> > --- > drivers/gpu/drm/radeon/radeon.h | 15 +- > drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- > drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ > 3 files changed, 248 insertions(+), 50 deletions(-) > From what I can see this is still suffering from the problem that we need to find a proper solution to, My summary of the issues after talking to Jerome and Ben and re-reading things is: We really need to work out a better interface into the drivers to be able to avoid random atomic entrypoints, I'm sure you have some ideas and I think you really need to investigate them to move this thing forward, even it if means some issues with android sync pts. but none of the two major drivers seem to want the interface as-is so something needs to give My major question is why we need an atomic callback here at all, what scenario does it cover? Surely we can use a workqueue based callback to ask a driver to check its signalling, is it really that urgent? Dave.
Am 22.07.2014 06:05, schrieb Dave Airlie: > On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: >> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >> --- >> drivers/gpu/drm/radeon/radeon.h | 15 +- >> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- >> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ >> 3 files changed, 248 insertions(+), 50 deletions(-) >> > From what I can see this is still suffering from the problem that we > need to find a proper solution to, > > My summary of the issues after talking to Jerome and Ben and > re-reading things is: > > We really need to work out a better interface into the drivers to be > able to avoid random atomic entrypoints, Which is exactly what I criticized from the very first beginning. Good to know that I'm not the only one thinking that this isn't such a good idea. > I'm sure you have some ideas and I think you really need to > investigate them to move this thing forward, > even it if means some issues with android sync pts. Actually I think that TTMs fence interface already gave quite a good hint how it might look like. I can only guess that this won't fit with the Android stuff, otherwise I can't see a good reason why we didn't stick with that. > but none of the two major drivers seem to want the interface as-is so > something needs to give > > My major question is why we need an atomic callback here at all, what > scenario does it cover? Agree totally. As far as I can see all current uses of the interface are of the kind of waiting for a fence to signal. No need for any callback from one driver into another, especially not in atomic context. If a driver needs such a functionality it should just start up a kernel thread and do it's waiting there. This obviously shouldn't be an obstacle for pure hardware implementations where one driver signals a semaphore another driver is waiting for, or a high signal on an interrupt line directly wired between two chips. And I think this is a completely different topic and not necessarily part of the common fence interface we should currently focus on. Christian. > Surely we can use a workqueue based callback to ask a driver to check > its signalling, is it really > that urgent? > > Dave.
On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: > Am 22.07.2014 06:05, schrieb Dave Airlie: > >On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: > >>Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> > >>--- > >> drivers/gpu/drm/radeon/radeon.h | 15 +- > >> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- > >> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ > >> 3 files changed, 248 insertions(+), 50 deletions(-) > >> > > From what I can see this is still suffering from the problem that we > >need to find a proper solution to, > > > >My summary of the issues after talking to Jerome and Ben and > >re-reading things is: > > > >We really need to work out a better interface into the drivers to be > >able to avoid random atomic entrypoints, > > Which is exactly what I criticized from the very first beginning. Good to > know that I'm not the only one thinking that this isn't such a good idea. I guess I've lost context a bit, but which atomic entry point are we talking about? Afaics the only one that's mandatory is the is fence->signaled callback to check whether a fence really has been signalled. It's used internally by the fence code to avoid spurious wakeups. Afaik that should be doable already on any hardware. If that's not the case then we can always track the signalled state in software and double-check in a worker thread before updating the sw state. And wrap this all up into a special fence class if there's more than one driver needing this. There is nothing else that forces callbacks from atomic contexts upon you. You can use them if you see it fit, but really if it doesn't suit your driver you can just ignore that part and do process based waits everywhere. > >I'm sure you have some ideas and I think you really need to > >investigate them to move this thing forward, > >even it if means some issues with android sync pts. > > Actually I think that TTMs fence interface already gave quite a good hint > how it might look like. I can only guess that this won't fit with the > Android stuff, otherwise I can't see a good reason why we didn't stick with > that. Well the current plan for i915<->radeon sync from Maarten is to use these atomic callbacks on the i915 side. So android didn't figure into this at all. Actually with android the entire implementation is kinda the platforms problem, the generic parts just give you a userspace interface and some means to stack up fences. > >but none of the two major drivers seem to want the interface as-is so > >something needs to give > > > >My major question is why we need an atomic callback here at all, what > >scenario does it cover? > > Agree totally. As far as I can see all current uses of the interface are of > the kind of waiting for a fence to signal. > > No need for any callback from one driver into another, especially not in > atomic context. If a driver needs such a functionality it should just start > up a kernel thread and do it's waiting there. > > This obviously shouldn't be an obstacle for pure hardware implementations > where one driver signals a semaphore another driver is waiting for, or a > high signal on an interrupt line directly wired between two chips. And I > think this is a completely different topic and not necessarily part of the > common fence interface we should currently focus on. It's for mixed hw/sw stuff where we want to poke the hw from the irq context (if possible) since someone forgot the wire. 
At least on the i915 side it boils down to one mmio write, and it's fairly pointless to launch a thread for that. So I haven't dug into the ttm details, but from the i915 side the current stuff and its atomic semantics make sense. Maybe we just need to wrap a bit more insulation around ttm-based drivers. -Daniel
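To make the insulation Daniel describes concrete, here is a rough sketch of how the core "is it signaled?" check behaves under this series, as far as I can tell: if the exporter provides no .signaled hook, the only thing consulted is the software-tracked bit that fence_signal() sets, so checking a foreign fence never calls back into the exporting driver. (Illustrative shape only, renamed so it is not mistaken for the exact core code.)

```c
/* Assumed shape of the core signaled-check in this series. With no
 * ->signaled hook the check reduces to reading a software-tracked bit,
 * which is safe from any context and never enters the exporter. */
static inline bool example_fence_is_signaled(struct fence *fence)
{
	if (test_bit(FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		return true;			/* sw state set by fence_signal() */

	if (fence->ops->signaled && fence->ops->signaled(fence)) {
		fence_signal(fence);		/* record it for later checks */
		return true;
	}

	return false;
}
```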
Hey, op 22-07-14 06:05, Dave Airlie schreef: > On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: >> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >> --- >> drivers/gpu/drm/radeon/radeon.h | 15 +- >> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- >> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ >> 3 files changed, 248 insertions(+), 50 deletions(-) >> > From what I can see this is still suffering from the problem that we > need to find a proper solution to, > > My summary of the issues after talking to Jerome and Ben and > re-reading things is: > > We really need to work out a better interface into the drivers to be > able to avoid random atomic entrypoints, > I'm sure you have some ideas and I think you really need to > investigate them to move this thing forward, > even it if means some issues with android sync pts. > > but none of the two major drivers seem to want the interface as-is so > something needs to give wait_queue_t (which radeon uses for fence_queue) uses atomic entrypoints too, the most common one being autoremove_wake_function, which wakes up the thread it was initialized from, and removes itself from the wait_queue_t list, in atomic fashion. It's used by __wait_event_interruptible_locked, if something internally wants to add some arbitrary callback it could already happen... > My major question is why we need an atomic callback here at all, what > scenario does it cover? A atomic callback could do something like schedule_work(&work) (like nouveau_fence_work already does right now!!!!). I've also added some more experimental things in my unsubmitted branch, in a codepath that's taken when synchronization is used with multiple GPU's: Nouveau: I write the new seqno to the GART fence, which I added a GPU wait for using SEMAPHORE_TRIGGER.ACQUIRE_GE. radeon: I write to a memory location to unblock the execution ring, this will probably be replaced by a call to the GPU scheduler. i915: write to the EXCC (condition code) register to unblock the ring operation when it's waiting for the condition code. But I want to emphasize that this is a hack, and driver maintainers will probably NACK it, I think I will only submit the one for nouveau, which is sane there because it schedules contexts in hardware. Even so that part is not final and will probably go through a few iterations before submission. > Surely we can use a workqueue based callback to ask a driver to check > its signalling, is it really > that urgent? Nothing prevents a driver from using that approach, even with those changes. Driver maintainers can still NACK the use of fence_add_callback if they want to, or choose not to export fences to outside the driver. Because fences are still not exporting, nothing will change for them compared to the current situation. ~Maarten
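A minimal sketch of the schedule_work() pattern Maarten refers to (the deferred_fence_work helper and its names are made up for illustration): the cross-driver fence callback does nothing but queue a work item, so all real processing happens later in process context, much like nouveau_fence_work does today.

```c
#include <linux/fence.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical helper: run a driver-supplied function in process context
 * once a (possibly foreign) fence signals. */
struct deferred_fence_work {
	struct fence_cb cb;
	struct work_struct work;
	void (*func)(void *data);
	void *data;
};

static void deferred_fence_work_fn(struct work_struct *work)
{
	struct deferred_fence_work *dfw =
		container_of(work, struct deferred_fence_work, work);

	dfw->func(dfw->data);		/* process context: may sleep, take locks */
	kfree(dfw);
}

static void deferred_fence_cb(struct fence *f, struct fence_cb *cb)
{
	struct deferred_fence_work *dfw =
		container_of(cb, struct deferred_fence_work, cb);

	schedule_work(&dfw->work);	/* the only thing done in atomic context */
}

static int deferred_fence_add(struct fence *f, void (*func)(void *), void *data)
{
	struct deferred_fence_work *dfw = kmalloc(sizeof(*dfw), GFP_KERNEL);
	int ret;

	if (!dfw)
		return -ENOMEM;

	dfw->func = func;
	dfw->data = data;
	INIT_WORK(&dfw->work, deferred_fence_work_fn);

	ret = fence_add_callback(f, &dfw->cb, deferred_fence_cb);
	if (ret == -ENOENT) {		/* fence already signaled: run directly */
		kfree(dfw);
		func(data);
		return 0;
	} else if (ret) {
		kfree(dfw);
	}
	return ret;
}
```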
On Tue, Jul 22, 2014 at 01:46:07PM +0200, Daniel Vetter wrote: > On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: > > Am 22.07.2014 06:05, schrieb Dave Airlie: > > >On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: > > >>Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> > > >>--- > > >> drivers/gpu/drm/radeon/radeon.h | 15 +- > > >> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- > > >> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ > > >> 3 files changed, 248 insertions(+), 50 deletions(-) > > >> > > > From what I can see this is still suffering from the problem that we > > >need to find a proper solution to, > > > > > >My summary of the issues after talking to Jerome and Ben and > > >re-reading things is: > > > > > >We really need to work out a better interface into the drivers to be > > >able to avoid random atomic entrypoints, > > > > Which is exactly what I criticized from the very first beginning. Good to > > know that I'm not the only one thinking that this isn't such a good idea. > > I guess I've lost context a bit, but which atomic entry point are we > talking about? Afaics the only one that's mandatory is the is > fence->signaled callback to check whether a fence really has been > signalled. It's used internally by the fence code to avoid spurious > wakeups. Afaik that should be doable already on any hardware. If that's > not the case then we can always track the signalled state in software and > double-check in a worker thread before updating the sw state. And wrap > this all up into a special fence class if there's more than one driver > needing this. > > There is nothing else that forces callbacks from atomic contexts upon you. > You can use them if you see it fit, but really if it doesn't suit your > driver you can just ignore that part and do process based waits > everywhere. Aside: The fence-process-callback has already been implemented by nouveau with the struct fence_work in nouveau_fence.c. Would make loads of sense to move that code into the driver core and adapat it to Maarten's struct fence once this has all landed. -Daniel
On Tue, Jul 22, 2014 at 01:46:07PM +0200, Daniel Vetter wrote: > On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: > > Am 22.07.2014 06:05, schrieb Dave Airlie: > > >On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: > > >>Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> > > >>--- > > >> drivers/gpu/drm/radeon/radeon.h | 15 +- > > >> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- > > >> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ > > >> 3 files changed, 248 insertions(+), 50 deletions(-) > > >> > > > From what I can see this is still suffering from the problem that we > > >need to find a proper solution to, > > > > > >My summary of the issues after talking to Jerome and Ben and > > >re-reading things is: > > > > > >We really need to work out a better interface into the drivers to be > > >able to avoid random atomic entrypoints, > > > > Which is exactly what I criticized from the very first beginning. Good to > > know that I'm not the only one thinking that this isn't such a good idea. > > I guess I've lost context a bit, but which atomic entry point are we > talking about? Afaics the only one that's mandatory is the is > fence->signaled callback to check whether a fence really has been > signalled. It's used internally by the fence code to avoid spurious > wakeups. Afaik that should be doable already on any hardware. If that's > not the case then we can always track the signalled state in software and > double-check in a worker thread before updating the sw state. And wrap > this all up into a special fence class if there's more than one driver > needing this. One thing I've forgotten: The i915 scheduler that's floating around runs its bottom half from irq context. So I really want to be able to check fence state from irq context and I also want to make it possible (possible! not mandatory) to register callbacks which are run from any context asap after the fence is signalled. If the radeon hw/driver doesn't want to cope with that complexity we can fully insolate it with the sw tracked fence state if you don't like Maarten's radeon implementation. But forcing everyone to forgoe this just because you don't like it and don't want to use it in radeon doesn't sound right. -Daniel
Am 22.07.2014 13:57, schrieb Daniel Vetter: > On Tue, Jul 22, 2014 at 01:46:07PM +0200, Daniel Vetter wrote: >> On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: >>> Am 22.07.2014 06:05, schrieb Dave Airlie: >>>> On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: >>>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >>>>> --- >>>>> drivers/gpu/drm/radeon/radeon.h | 15 +- >>>>> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- >>>>> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ >>>>> 3 files changed, 248 insertions(+), 50 deletions(-) >>>>> >>>> From what I can see this is still suffering from the problem that we >>>> need to find a proper solution to, >>>> >>>> My summary of the issues after talking to Jerome and Ben and >>>> re-reading things is: >>>> >>>> We really need to work out a better interface into the drivers to be >>>> able to avoid random atomic entrypoints, >>> Which is exactly what I criticized from the very first beginning. Good to >>> know that I'm not the only one thinking that this isn't such a good idea. >> I guess I've lost context a bit, but which atomic entry point are we >> talking about? Afaics the only one that's mandatory is the is >> fence->signaled callback to check whether a fence really has been >> signalled. It's used internally by the fence code to avoid spurious >> wakeups. Afaik that should be doable already on any hardware. If that's >> not the case then we can always track the signalled state in software and >> double-check in a worker thread before updating the sw state. And wrap >> this all up into a special fence class if there's more than one driver >> needing this. > One thing I've forgotten: The i915 scheduler that's floating around runs > its bottom half from irq context. So I really want to be able to check > fence state from irq context and I also want to make it possible > (possible! not mandatory) to register callbacks which are run from any > context asap after the fence is signalled. NAK, that's just the bad design I've talked about. Checking fence state inside the same driver from interrupt context is OK, because it's the drivers interrupt that we are talking about here. Checking fence status from another drivers interrupt context is what really concerns me here, cause your driver doesn't have the slightest idea if the called driver is really capable of checking the fence right now. > If the radeon hw/driver doesn't want to cope with that complexity we can > fully insolate it with the sw tracked fence state if you don't like > Maarten's radeon implementation. But forcing everyone to forgoe this just > because you don't like it and don't want to use it in radeon doesn't sound > right. While it's clearly a hack Maarten's solution for radeon would indeed work, but that's not really the point here. It's just that I think leaking interrupt context from one driver into another driver is just a really really bad idea from a design point of view. And calling into a driver while in atomic context to check for a fence being signaled doesn't sounds like a good idea either, cause that limits way to much what the called driver can do for checking the status of a fence. Christian.
On Tue, Jul 22, 2014 at 02:19:57PM +0200, Christian König wrote: > Am 22.07.2014 13:57, schrieb Daniel Vetter: > >On Tue, Jul 22, 2014 at 01:46:07PM +0200, Daniel Vetter wrote: > >>On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: > >>>Am 22.07.2014 06:05, schrieb Dave Airlie: > >>>>On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: > >>>>>Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> > >>>>>--- > >>>>> drivers/gpu/drm/radeon/radeon.h | 15 +- > >>>>> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- > >>>>> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ > >>>>> 3 files changed, 248 insertions(+), 50 deletions(-) > >>>>> > >>>> From what I can see this is still suffering from the problem that we > >>>>need to find a proper solution to, > >>>> > >>>>My summary of the issues after talking to Jerome and Ben and > >>>>re-reading things is: > >>>> > >>>>We really need to work out a better interface into the drivers to be > >>>>able to avoid random atomic entrypoints, > >>>Which is exactly what I criticized from the very first beginning. Good to > >>>know that I'm not the only one thinking that this isn't such a good idea. > >>I guess I've lost context a bit, but which atomic entry point are we > >>talking about? Afaics the only one that's mandatory is the is > >>fence->signaled callback to check whether a fence really has been > >>signalled. It's used internally by the fence code to avoid spurious > >>wakeups. Afaik that should be doable already on any hardware. If that's > >>not the case then we can always track the signalled state in software and > >>double-check in a worker thread before updating the sw state. And wrap > >>this all up into a special fence class if there's more than one driver > >>needing this. > >One thing I've forgotten: The i915 scheduler that's floating around runs > >its bottom half from irq context. So I really want to be able to check > >fence state from irq context and I also want to make it possible > >(possible! not mandatory) to register callbacks which are run from any > >context asap after the fence is signalled. > > NAK, that's just the bad design I've talked about. Checking fence state > inside the same driver from interrupt context is OK, because it's the > drivers interrupt that we are talking about here. > > Checking fence status from another drivers interrupt context is what really > concerns me here, cause your driver doesn't have the slightest idea if the > called driver is really capable of checking the fence right now. I guess my mail hasn't been clear then. If you don't like it we could add a bit of glue to insulate the madness and bad design i915 might do from radeon. That imo doesn't invalidate the overall fence interfaces. So what about the following: - fence->enabling_signaling is restricted to be called from process context. We don't use any different yet, so would boild down to adding a WARN_ON(in_interrupt) or so to fence_enable_sw_signalling. - Make fence->signaled optional (already the case) and don't implement it in readon (i.e. reduce this patch here). Only downside is that radeon needs to correctly (i.e. without races or so) call fence_signal. And the cross-driver synchronization might be a bit less efficient. Note that you can call fence_signal from wherever you want to, so hopefully that doesn't restrict your implementation. End result: No one calls into radeon from interrupt context, and this is guaranteed. 
Would that be something you can agree to? Like I've said I think restricting the insanity other people are willing to live with just because you don't like it isn't right. But it is certainly right for you to insist on not being forced into any such design. I think the above would achieve this. -Daniel
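As a sketch of what the first point might boil down to (this mirrors the assumed shape of the core helper in this series, with only the proposed assertion added; it is not code from the patch):

```c
#include <linux/fence.h>
#include <linux/hardirq.h>

/* Assumed shape of fence_enable_sw_signaling(), plus the proposed
 * process-context-only assertion: with this in place a driver's
 * ->enable_signaling callback can never run from a foreign interrupt
 * handler. */
void fence_enable_sw_signaling(struct fence *fence)
{
	unsigned long flags;

	WARN_ON_ONCE(in_interrupt());	/* proposed rule: process context only */

	if (!test_and_set_bit(FENCE_FLAG_ENABLE_SIGNAL_BIT, &fence->flags) &&
	    !test_bit(FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
		spin_lock_irqsave(fence->lock, flags);
		if (!fence->ops->enable_signaling(fence))
			fence_signal_locked(fence);
		spin_unlock_irqrestore(fence->lock, flags);
	}
}
```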
Am 22.07.2014 15:26, schrieb Daniel Vetter: > On Tue, Jul 22, 2014 at 02:19:57PM +0200, Christian König wrote: >> Am 22.07.2014 13:57, schrieb Daniel Vetter: >>> On Tue, Jul 22, 2014 at 01:46:07PM +0200, Daniel Vetter wrote: >>>> On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: >>>>> Am 22.07.2014 06:05, schrieb Dave Airlie: >>>>>> On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: >>>>>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >>>>>>> --- >>>>>>> drivers/gpu/drm/radeon/radeon.h | 15 +- >>>>>>> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- >>>>>>> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ >>>>>>> 3 files changed, 248 insertions(+), 50 deletions(-) >>>>>>> >>>>>> From what I can see this is still suffering from the problem that we >>>>>> need to find a proper solution to, >>>>>> >>>>>> My summary of the issues after talking to Jerome and Ben and >>>>>> re-reading things is: >>>>>> >>>>>> We really need to work out a better interface into the drivers to be >>>>>> able to avoid random atomic entrypoints, >>>>> Which is exactly what I criticized from the very first beginning. Good to >>>>> know that I'm not the only one thinking that this isn't such a good idea. >>>> I guess I've lost context a bit, but which atomic entry point are we >>>> talking about? Afaics the only one that's mandatory is the is >>>> fence->signaled callback to check whether a fence really has been >>>> signalled. It's used internally by the fence code to avoid spurious >>>> wakeups. Afaik that should be doable already on any hardware. If that's >>>> not the case then we can always track the signalled state in software and >>>> double-check in a worker thread before updating the sw state. And wrap >>>> this all up into a special fence class if there's more than one driver >>>> needing this. >>> One thing I've forgotten: The i915 scheduler that's floating around runs >>> its bottom half from irq context. So I really want to be able to check >>> fence state from irq context and I also want to make it possible >>> (possible! not mandatory) to register callbacks which are run from any >>> context asap after the fence is signalled. >> NAK, that's just the bad design I've talked about. Checking fence state >> inside the same driver from interrupt context is OK, because it's the >> drivers interrupt that we are talking about here. >> >> Checking fence status from another drivers interrupt context is what really >> concerns me here, cause your driver doesn't have the slightest idea if the >> called driver is really capable of checking the fence right now. > I guess my mail hasn't been clear then. If you don't like it we could add > a bit of glue to insulate the madness and bad design i915 might do from > radeon. That imo doesn't invalidate the overall fence interfaces. > > So what about the following: > - fence->enabling_signaling is restricted to be called from process > context. We don't use any different yet, so would boild down to adding a > WARN_ON(in_interrupt) or so to fence_enable_sw_signalling. > > - Make fence->signaled optional (already the case) and don't implement it > in readon (i.e. reduce this patch here). Only downside is that radeon > needs to correctly (i.e. without races or so) call fence_signal. And the > cross-driver synchronization might be a bit less efficient. Note that > you can call fence_signal from wherever you want to, so hopefully that > doesn't restrict your implementation. 
> > End result: No one calls into radeon from interrupt context, and this is > guaranteed. > > Would that be something you can agree to? No, the whole enable_signaling stuff should go away. No callback from the driver into the fence code, only the other way around. fence->signaled as well as fence->wait should become mandatory and only called from process context without holding any locks, neither atomic nor any mutex/semaphore (rcu might be ok). > Like I've said I think restricting the insanity other people are willing > to live with just because you don't like it isn't right. But it is > certainly right for you to insist on not being forced into any such > design. I think the above would achieve this. I don't think so. If it were just me, I would say that I'm just too cautious and the idea is still safe to apply to the whole kernel. But since Dave, Jerome and Ben seem to have similar concerns, I think we need to agree on a minimal and safe interface for all drivers. Christian.
op 22-07-14 15:45, Christian König schreef: > Am 22.07.2014 15:26, schrieb Daniel Vetter: >> On Tue, Jul 22, 2014 at 02:19:57PM +0200, Christian König wrote: >>> Am 22.07.2014 13:57, schrieb Daniel Vetter: >>>> On Tue, Jul 22, 2014 at 01:46:07PM +0200, Daniel Vetter wrote: >>>>> On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: >>>>>> Am 22.07.2014 06:05, schrieb Dave Airlie: >>>>>>> On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: >>>>>>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >>>>>>>> --- >>>>>>>> drivers/gpu/drm/radeon/radeon.h | 15 +- >>>>>>>> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- >>>>>>>> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ >>>>>>>> 3 files changed, 248 insertions(+), 50 deletions(-) >>>>>>>> >>>>>>> From what I can see this is still suffering from the problem that we >>>>>>> need to find a proper solution to, >>>>>>> >>>>>>> My summary of the issues after talking to Jerome and Ben and >>>>>>> re-reading things is: >>>>>>> >>>>>>> We really need to work out a better interface into the drivers to be >>>>>>> able to avoid random atomic entrypoints, >>>>>> Which is exactly what I criticized from the very first beginning. Good to >>>>>> know that I'm not the only one thinking that this isn't such a good idea. >>>>> I guess I've lost context a bit, but which atomic entry point are we >>>>> talking about? Afaics the only one that's mandatory is the is >>>>> fence->signaled callback to check whether a fence really has been >>>>> signalled. It's used internally by the fence code to avoid spurious >>>>> wakeups. Afaik that should be doable already on any hardware. If that's >>>>> not the case then we can always track the signalled state in software and >>>>> double-check in a worker thread before updating the sw state. And wrap >>>>> this all up into a special fence class if there's more than one driver >>>>> needing this. >>>> One thing I've forgotten: The i915 scheduler that's floating around runs >>>> its bottom half from irq context. So I really want to be able to check >>>> fence state from irq context and I also want to make it possible >>>> (possible! not mandatory) to register callbacks which are run from any >>>> context asap after the fence is signalled. >>> NAK, that's just the bad design I've talked about. Checking fence state >>> inside the same driver from interrupt context is OK, because it's the >>> drivers interrupt that we are talking about here. >>> >>> Checking fence status from another drivers interrupt context is what really >>> concerns me here, cause your driver doesn't have the slightest idea if the >>> called driver is really capable of checking the fence right now. >> I guess my mail hasn't been clear then. If you don't like it we could add >> a bit of glue to insulate the madness and bad design i915 might do from >> radeon. That imo doesn't invalidate the overall fence interfaces. >> >> So what about the following: >> - fence->enabling_signaling is restricted to be called from process >> context. We don't use any different yet, so would boild down to adding a >> WARN_ON(in_interrupt) or so to fence_enable_sw_signalling. >> >> - Make fence->signaled optional (already the case) and don't implement it >> in readon (i.e. reduce this patch here). Only downside is that radeon >> needs to correctly (i.e. without races or so) call fence_signal. And the >> cross-driver synchronization might be a bit less efficient. 
Note that >> you can call fence_signal from wherever you want to, so hopefully that >> doesn't restrict your implementation. >> >> End result: No one calls into radeon from interrupt context, and this is >> guaranteed. >> >> Would that be something you can agree to? > > No, the whole enable_signaling stuff should go away. No callback from the driver into the fence code, only the other way around. > > fence->signaled as well as fence->wait should become mandatory and only called from process context without holding any locks, neither atomic nor any mutex/semaphore (rcu might be ok). fence->wait is mandatory, and already requires sleeping. If .signaled is not implemented there is no guarantee the fence will be signaled sometime soon, this is also why enable_signaling exists, to allow the driver to flush. I get it that it doesn't apply to radeon and nouveau, but for other drivers that could be necessary, like vmwgfx. Ironically that is also a part of the ttm fence, except it was called flush there. I would also like to note that ttm_bo_wait currently is also a function that currently uses is_signaled from atomic_context... For the more complicated locking worries: Lockdep is your friend, use PROVE_LOCKING and find bugs before they trigger. ;-) >> Like I've said I think restricting the insanity other people are willing >> to live with just because you don't like it isn't right. But it is >> certainly right for you to insist on not being forced into any such >> design. I think the above would achieve this. > > I don't think so. If it's just me I would say that I'm just to cautious and the idea is still save to apply to the whole kernel. > > But since Dave, Jerome and Ben seems to have similar concerns I think we need to agree to a minimum and save interface for all drivers. > > Christian. >
Am 22.07.2014 16:44, schrieb Maarten Lankhorst: > op 22-07-14 15:45, Christian König schreef: >> Am 22.07.2014 15:26, schrieb Daniel Vetter: >>> On Tue, Jul 22, 2014 at 02:19:57PM +0200, Christian König wrote: >>>> Am 22.07.2014 13:57, schrieb Daniel Vetter: >>>>> On Tue, Jul 22, 2014 at 01:46:07PM +0200, Daniel Vetter wrote: >>>>>> On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: >>>>>>> Am 22.07.2014 06:05, schrieb Dave Airlie: >>>>>>>> On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: >>>>>>>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >>>>>>>>> --- >>>>>>>>> drivers/gpu/drm/radeon/radeon.h | 15 +- >>>>>>>>> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- >>>>>>>>> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ >>>>>>>>> 3 files changed, 248 insertions(+), 50 deletions(-) >>>>>>>>> >>>>>>>> From what I can see this is still suffering from the problem that we >>>>>>>> need to find a proper solution to, >>>>>>>> >>>>>>>> My summary of the issues after talking to Jerome and Ben and >>>>>>>> re-reading things is: >>>>>>>> >>>>>>>> We really need to work out a better interface into the drivers to be >>>>>>>> able to avoid random atomic entrypoints, >>>>>>> Which is exactly what I criticized from the very first beginning. Good to >>>>>>> know that I'm not the only one thinking that this isn't such a good idea. >>>>>> I guess I've lost context a bit, but which atomic entry point are we >>>>>> talking about? Afaics the only one that's mandatory is the is >>>>>> fence->signaled callback to check whether a fence really has been >>>>>> signalled. It's used internally by the fence code to avoid spurious >>>>>> wakeups. Afaik that should be doable already on any hardware. If that's >>>>>> not the case then we can always track the signalled state in software and >>>>>> double-check in a worker thread before updating the sw state. And wrap >>>>>> this all up into a special fence class if there's more than one driver >>>>>> needing this. >>>>> One thing I've forgotten: The i915 scheduler that's floating around runs >>>>> its bottom half from irq context. So I really want to be able to check >>>>> fence state from irq context and I also want to make it possible >>>>> (possible! not mandatory) to register callbacks which are run from any >>>>> context asap after the fence is signalled. >>>> NAK, that's just the bad design I've talked about. Checking fence state >>>> inside the same driver from interrupt context is OK, because it's the >>>> drivers interrupt that we are talking about here. >>>> >>>> Checking fence status from another drivers interrupt context is what really >>>> concerns me here, cause your driver doesn't have the slightest idea if the >>>> called driver is really capable of checking the fence right now. >>> I guess my mail hasn't been clear then. If you don't like it we could add >>> a bit of glue to insulate the madness and bad design i915 might do from >>> radeon. That imo doesn't invalidate the overall fence interfaces. >>> >>> So what about the following: >>> - fence->enabling_signaling is restricted to be called from process >>> context. We don't use any different yet, so would boild down to adding a >>> WARN_ON(in_interrupt) or so to fence_enable_sw_signalling. >>> >>> - Make fence->signaled optional (already the case) and don't implement it >>> in readon (i.e. reduce this patch here). Only downside is that radeon >>> needs to correctly (i.e. 
without races or so) call fence_signal. And the >>> cross-driver synchronization might be a bit less efficient. Note that >>> you can call fence_signal from wherever you want to, so hopefully that >>> doesn't restrict your implementation. >>> >>> End result: No one calls into radeon from interrupt context, and this is >>> guaranteed. >>> >>> Would that be something you can agree to? >> No, the whole enable_signaling stuff should go away. No callback from the driver into the fence code, only the other way around. >> >> fence->signaled as well as fence->wait should become mandatory and only called from process context without holding any locks, neither atomic nor any mutex/semaphore (rcu might be ok). > fence->wait is mandatory, and already requires sleeping. > > If .signaled is not implemented there is no guarantee the fence will be > signaled sometime soon, this is also why enable_signaling exists, to > allow the driver to flush. I get it that it doesn't apply to radeon and nouveau, > but for other drivers that could be necessary, like vmwgfx. > > Ironically that is also a part of the ttm fence, except it was called flush there. Then call it flush again and make it optional like in TTM. > I would also like to note that ttm_bo_wait currently is also a function that currently uses is_signaled from atomic_context... I know, but TTM is only called from inside a single driver, no inter driver needs here. We currently even call the internal fence implementation from interrupt context as well and at more than one occasion assume that TTM only uses radeon fences. Christian. > For the more complicated locking worries: Lockdep is your friend, use PROVE_LOCKING and find bugs before they trigger. ;-) > >>> Like I've said I think restricting the insanity other people are willing >>> to live with just because you don't like it isn't right. But it is >>> certainly right for you to insist on not being forced into any such >>> design. I think the above would achieve this. >> I don't think so. If it's just me I would say that I'm just to cautious and the idea is still save to apply to the whole kernel. >> >> But since Dave, Jerome and Ben seems to have similar concerns I think we need to agree to a minimum and save interface for all drivers. >> >> Christian. >>
On Tue, Jul 22, 2014 at 3:45 PM, Christian König <deathsimple@vodafone.de> wrote: >> Would that be something you can agree to? > > No, the whole enable_signaling stuff should go away. No callback from the > driver into the fence code, only the other way around. > > fence->signaled as well as fence->wait should become mandatory and only > called from process context without holding any locks, neither atomic nor > any mutex/semaphore (rcu might be ok). So for enable_signaling, that's optional already. It's only for drivers that don't want to keep interrupts enabled all the time. You can opt out of that easily. Wrt holding no locks at all while calling into any fence functions, that's just not going to work out. The point here is to make different drivers work together, and we can't rework all the ttm and i915 code to work locklessly in all the cases where they need to wait for someone to complete rendering; or at least I don't think that's feasible. So if you insist that no one may call into radeon code, then we simply need to exclude radeon from participating in any shared fencing. But that's a bit pointless. >> Like I've said I think restricting the insanity other people are willing >> to live with just because you don't like it isn't right. But it is >> certainly right for you to insist on not being forced into any such >> design. I think the above would achieve this. > > I don't think so. If it's just me I would say that I'm just to cautious and > the idea is still save to apply to the whole kernel. > > But since Dave, Jerome and Ben seems to have similar concerns I think we > need to agree to a minimum and save interface for all drivers. Well, I haven't yet seen a proposal that actually works. From an intel pov I don't care that much, since we don't care about desktop prime, so if radeon/nouveau don't want to do that, meh. Imo the design as-is is fairly sound, and as simple as it can get given the requirements. I haven't heard an argument convincing me otherwise, so I guess we won't have prime support on linux that actually works, ever. -Daniel
Hey, op 22-07-14 17:02, Christian König schreef: > Am 22.07.2014 16:44, schrieb Maarten Lankhorst: >> op 22-07-14 15:45, Christian König schreef: >>> Am 22.07.2014 15:26, schrieb Daniel Vetter: >>>> On Tue, Jul 22, 2014 at 02:19:57PM +0200, Christian König wrote: >>>>> Am 22.07.2014 13:57, schrieb Daniel Vetter: >>>>>> On Tue, Jul 22, 2014 at 01:46:07PM +0200, Daniel Vetter wrote: >>>>>>> On Tue, Jul 22, 2014 at 10:43:13AM +0200, Christian König wrote: >>>>>>>> Am 22.07.2014 06:05, schrieb Dave Airlie: >>>>>>>>> On 9 July 2014 22:29, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: >>>>>>>>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> >>>>>>>>>> --- >>>>>>>>>> drivers/gpu/drm/radeon/radeon.h | 15 +- >>>>>>>>>> drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- >>>>>>>>>> drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ >>>>>>>>>> 3 files changed, 248 insertions(+), 50 deletions(-) >>>>>>>>>> >>>>>>>>> From what I can see this is still suffering from the problem that we >>>>>>>>> need to find a proper solution to, >>>>>>>>> >>>>>>>>> My summary of the issues after talking to Jerome and Ben and >>>>>>>>> re-reading things is: >>>>>>>>> >>>>>>>>> We really need to work out a better interface into the drivers to be >>>>>>>>> able to avoid random atomic entrypoints, >>>>>>>> Which is exactly what I criticized from the very first beginning. Good to >>>>>>>> know that I'm not the only one thinking that this isn't such a good idea. >>>>>>> I guess I've lost context a bit, but which atomic entry point are we >>>>>>> talking about? Afaics the only one that's mandatory is the is >>>>>>> fence->signaled callback to check whether a fence really has been >>>>>>> signalled. It's used internally by the fence code to avoid spurious >>>>>>> wakeups. Afaik that should be doable already on any hardware. If that's >>>>>>> not the case then we can always track the signalled state in software and >>>>>>> double-check in a worker thread before updating the sw state. And wrap >>>>>>> this all up into a special fence class if there's more than one driver >>>>>>> needing this. >>>>>> One thing I've forgotten: The i915 scheduler that's floating around runs >>>>>> its bottom half from irq context. So I really want to be able to check >>>>>> fence state from irq context and I also want to make it possible >>>>>> (possible! not mandatory) to register callbacks which are run from any >>>>>> context asap after the fence is signalled. >>>>> NAK, that's just the bad design I've talked about. Checking fence state >>>>> inside the same driver from interrupt context is OK, because it's the >>>>> drivers interrupt that we are talking about here. >>>>> >>>>> Checking fence status from another drivers interrupt context is what really >>>>> concerns me here, cause your driver doesn't have the slightest idea if the >>>>> called driver is really capable of checking the fence right now. >>>> I guess my mail hasn't been clear then. If you don't like it we could add >>>> a bit of glue to insulate the madness and bad design i915 might do from >>>> radeon. That imo doesn't invalidate the overall fence interfaces. >>>> >>>> So what about the following: >>>> - fence->enabling_signaling is restricted to be called from process >>>> context. We don't use any different yet, so would boild down to adding a >>>> WARN_ON(in_interrupt) or so to fence_enable_sw_signalling. >>>> >>>> - Make fence->signaled optional (already the case) and don't implement it >>>> in readon (i.e. 
reduce this patch here). Only downside is that radeon >>>> needs to correctly (i.e. without races or so) call fence_signal. And the >>>> cross-driver synchronization might be a bit less efficient. Note that >>>> you can call fence_signal from wherever you want to, so hopefully that >>>> doesn't restrict your implementation. >>>> >>>> End result: No one calls into radeon from interrupt context, and this is >>>> guaranteed. >>>> >>>> Would that be something you can agree to? >>> No, the whole enable_signaling stuff should go away. No callback from the driver into the fence code, only the other way around. >>> >>> fence->signaled as well as fence->wait should become mandatory and only called from process context without holding any locks, neither atomic nor any mutex/semaphore (rcu might be ok). >> fence->wait is mandatory, and already requires sleeping. >> >> If .signaled is not implemented there is no guarantee the fence will be >> signaled sometime soon, this is also why enable_signaling exists, to >> allow the driver to flush. I get it that it doesn't apply to radeon and nouveau, >> but for other drivers that could be necessary, like vmwgfx. >> >> Ironically that is also a part of the ttm fence, except it was called flush there. > > Then call it flush again and make it optional like in TTM. You've posted a lot of concerns, but I haven't seen you come up with any scenario that could create a lockup and lockdep wouldn't warn about. >> I would also like to note that ttm_bo_wait currently is also a function that currently uses is_signaled from atomic_context... > > I know, but TTM is only called from inside a single driver, no inter driver needs here. We currently even call the internal fence implementation from interrupt context as well and at more than one occasion assume that TTM only uses radeon fences. This is no longer true when you start synchronizing with other drivers. The TTM core will see the intel fences and treat them no differently. That's the entire reason for this conversion. It's also needed to remove the need to pin dma-buf when exporting. ~Maarten.
Am 22.07.2014 17:17, schrieb Daniel Vetter: > On Tue, Jul 22, 2014 at 3:45 PM, Christian König > <deathsimple@vodafone.de> wrote: >>> Would that be something you can agree to? >> >> No, the whole enable_signaling stuff should go away. No callback from the >> driver into the fence code, only the other way around. >> >> fence->signaled as well as fence->wait should become mandatory and only >> called from process context without holding any locks, neither atomic nor >> any mutex/semaphore (rcu might be ok). > So for the enable_signaling, that's optional already. It's only for > drivers that don't want to keep interrupts enabled all the time. You > can opt out of that easily. > > Wrt holding no locks at all while calling into any fence functions, > that's just not going to work out. The point here is to make different > drivers work together and we can rework all the ttm and i915 code to > work locklessly in all cases where they need to wait for someone to > complete rendering. Or at least I don't think that's feasible. So if > you insist that no one might call into radeon code then we simply need > to exclude radeon from participating in any shared fencing. But that's > a bit pointless. > >>> Like I've said I think restricting the insanity other people are willing >>> to live with just because you don't like it isn't right. But it is >>> certainly right for you to insist on not being forced into any such >>> design. I think the above would achieve this. >> >> I don't think so. If it's just me I would say that I'm just to cautious and >> the idea is still save to apply to the whole kernel. >> >> But since Dave, Jerome and Ben seems to have similar concerns I think we >> need to agree to a minimum and save interface for all drivers. > Well I haven't yet seen a proposal that actually works. How about this: Drivers exporting fences need to provide a fence->signaled and a fence->wait function, everything else like fence->enable_signaling or calling fence_signaled() from the driver is optional. Drivers wanting to use exported fences don't call fence->signaled or fence->wait in atomic or interrupt context, and not with holding any global locking primitives (like mmap_sem etc...). Holding locking primitives local to the driver is ok, as long as they don't conflict with anything possible used by their own fence implementation. Christian. > From an intel > pov I don't care that much since we don't care about desktop prime, so > if radeon/nouveau don't want to do that, meh. Imo the design as-is is > fairly sound, and as simple as it can get given the requirements. I > haven't heard an argument convincing me otherwise, so I guess we > won't have prime support on linux that actually works, ever. > -Daniel
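As a rough illustration of what this proposed minimum contract would mean in code, here is a sketch of a fence_ops table under Christian's rules, assuming the fence_ops layout from the proposed <linux/fence.h>; the radeon_fence_* callback names are purely illustrative and not the driver's actual functions.

```c
#include <linux/fence.h>

/* illustrative prototypes, not radeon's real implementations */
const char *radeon_fence_get_driver_name(struct fence *fence);
const char *radeon_fence_get_timeline_name(struct fence *fence);
bool radeon_fence_is_signaled(struct fence *fence);
signed long radeon_fence_default_wait(struct fence *fence, bool intr,
				      signed long timeout);

static const struct fence_ops radeon_fence_ops_sketch = {
	.get_driver_name   = radeon_fence_get_driver_name,
	.get_timeline_name = radeon_fence_get_timeline_name,

	/*
	 * Mandatory under the proposal: a pure state query, called only
	 * from process context and without any driver-global locks held.
	 */
	.signaled = radeon_fence_is_signaled,

	/* Mandatory under the proposal: blocking wait, process context only. */
	.wait = radeon_fence_default_wait,

	/*
	 * Optional under the proposal: only for drivers that want to keep
	 * interrupts off until somebody actually waits on the fence.
	 */
	.enable_signaling = NULL,
};
```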
On Tue, Jul 22, 2014 at 5:35 PM, Christian König <christian.koenig@amd.com> wrote: > Drivers exporting fences need to provide a fence->signaled and a fence->wait > function, everything else like fence->enable_signaling or calling > fence_signaled() from the driver is optional. > > Drivers wanting to use exported fences don't call fence->signaled or > fence->wait in atomic or interrupt context, and not with holding any global > locking primitives (like mmap_sem etc...). Holding locking primitives local > to the driver is ok, as long as they don't conflict with anything possible > used by their own fence implementation. Well that's almost what we have right now with the exception that drivers are allowed (actually must for correctness when updating fences) the ww_mutexes for dma-bufs (or other buffer objects). Locking correctness is enforced with some extremely nasty lockdep annotations + additional debugging infrastructure enabled with CONFIG_DEBUG_WW_MUTEX_SLOWPATH. We really need to be able to hold dma-buf ww_mutexes while updating fences or waiting for them. And obviously for ->wait we need non-atomic context, not just non-interrupt. Agreed that any shared locks are out of the way (especially stuff like dev->struct_mutex or other non-strictly driver-private stuff, i915 is really bad here still). So from the core fence framework I think we already have exactly this, and we only need to adjust the radeon implementation a bit to make it less risky and invasive to the radeon driver logic. -Daniel
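The "hold the dma-buf ww_mutex while updating fences" rule Daniel refers to boils down to roughly the following pattern. This is a sketch only: reservation_object_add_excl_fence() comes from the same patch series, and the -EDEADLK backoff dance needed when locking several buffers is omitted.

```c
#include <linux/fence.h>
#include <linux/reservation.h>
#include <linux/ww_mutex.h>

/*
 * Sketch: attach a newly emitted fence as the exclusive fence of a shared
 * buffer while holding its reservation ww_mutex.  Real code that locks more
 * than one buffer must handle -EDEADLK by backing off and retrying.
 */
static int attach_excl_fence(struct reservation_object *resv,
			     struct fence *fence)
{
	struct ww_acquire_ctx ctx;
	int ret;

	ww_acquire_init(&ctx, &reservation_ww_class);

	ret = ww_mutex_lock(&resv->lock, &ctx);
	if (ret == 0) {
		/* publish the new exclusive fence for this buffer */
		reservation_object_add_excl_fence(resv, fence);
		ww_mutex_unlock(&resv->lock);
	}

	ww_acquire_fini(&ctx);
	return ret;
}
```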
Am 22.07.2014 17:42, schrieb Daniel Vetter: > On Tue, Jul 22, 2014 at 5:35 PM, Christian König > <christian.koenig@amd.com> wrote: >> Drivers exporting fences need to provide a fence->signaled and a fence->wait >> function, everything else like fence->enable_signaling or calling >> fence_signaled() from the driver is optional. >> >> Drivers wanting to use exported fences don't call fence->signaled or >> fence->wait in atomic or interrupt context, and not with holding any global >> locking primitives (like mmap_sem etc...). Holding locking primitives local >> to the driver is ok, as long as they don't conflict with anything possible >> used by their own fence implementation. > Well that's almost what we have right now with the exception that > drivers are allowed (actually must for correctness when updating > fences) the ww_mutexes for dma-bufs (or other buffer objects). In this case sorry for so much noise. I really haven't looked in so much detail into anything but Maarten's Radeon patches. But how does that then work right now? My impression was that it's mandatory for drivers to call fence_signaled()? > Locking > correctness is enforced with some extremely nasty lockdep annotations > + additional debugging infrastructure enabled with > CONFIG_DEBUG_WW_MUTEX_SLOWPATH. We really need to be able to hold > dma-buf ww_mutexes while updating fences or waiting for them. And > obviously for ->wait we need non-atomic context, not just > non-interrupt. Sounds mostly reasonable, but for holding the dma-buf ww_mutex, wouldn't be an RCU be more appropriate here? E.g. aren't we just interested that the current assigned fence at some point is signaled? Something like grab ww_mutexes, grab a reference to the current fence object, release ww_mutex, wait for fence, release reference to the fence object. > Agreed that any shared locks are out of the way (especially stuff like > dev->struct_mutex or other non-strictly driver-private stuff, i915 is > really bad here still). Yeah that's also an point I've wanted to note on Maartens patch. Radeon grabs the read side of it's exclusive semaphore while waiting for fences (because it assumes that the fence it waits for is a Radeon fence). Assuming that we need to wait in both directions with Prime (e.g. Intel driver needs to wait for Radeon to finish rendering and Radeon needs to wait for Intel to finish displaying), this might become a perfect example of locking inversion. > So from the core fence framework I think we already have exactly this, > and we only need to adjust the radeon implementation a bit to make it > less risky and invasive to the radeon driver logic. Agree. Well the biggest problem I see is that exclusive semaphore I need to take when anything calls into the driver. For the fence code I need to move that down into the fence->signaled handler, cause that now can be called from outside the driver. Maarten solved this by telling the driver in the lockup handler (where we grab the write side of the exclusive lock) that all interrupts are already enabled, so that fence->signaled hopefully wouldn't mess with the hardware at all. While this probably works, it just leaves me with a feeling that we are doing something wrong here. Christian. > -Daniel
On Tue, Jul 22, 2014 at 5:59 PM, Christian König <deathsimple@vodafone.de> wrote: > Am 22.07.2014 17:42, schrieb Daniel Vetter: > >> On Tue, Jul 22, 2014 at 5:35 PM, Christian König >> <christian.koenig@amd.com> wrote: >>> >>> Drivers exporting fences need to provide a fence->signaled and a >>> fence->wait >>> function, everything else like fence->enable_signaling or calling >>> fence_signaled() from the driver is optional. >>> >>> Drivers wanting to use exported fences don't call fence->signaled or >>> fence->wait in atomic or interrupt context, and not with holding any >>> global >>> locking primitives (like mmap_sem etc...). Holding locking primitives >>> local >>> to the driver is ok, as long as they don't conflict with anything >>> possible >>> used by their own fence implementation. >> >> Well that's almost what we have right now with the exception that >> drivers are allowed (actually must for correctness when updating >> fences) the ww_mutexes for dma-bufs (or other buffer objects). > > > In this case sorry for so much noise. I really haven't looked in so much > detail into anything but Maarten's Radeon patches. > > But how does that then work right now? My impression was that it's mandatory > for drivers to call fence_signaled()? Maybe I've mixed things up a bit in my description. There is fence_signal which the implementor/exporter of a fence must call when the fence is completed. If the exporter has an ->enable_signaling callback it can delay that call to fence_signal for as long as it wishes as long as enable_signaling isn't called yet. But that's just the optimization to not required irqs to be turned on all the time. The other function is fence_is_signaled, which is used by code that is interested in the fence state, together with fence_wait if it wants to block and not just wants to know the momentary fence state. All the other functions (the stuff that adds callbacks and the various _locked and other versions) are just for fancy special cases. >> Locking >> correctness is enforced with some extremely nasty lockdep annotations >> + additional debugging infrastructure enabled with >> CONFIG_DEBUG_WW_MUTEX_SLOWPATH. We really need to be able to hold >> dma-buf ww_mutexes while updating fences or waiting for them. And >> obviously for ->wait we need non-atomic context, not just >> non-interrupt. > > > Sounds mostly reasonable, but for holding the dma-buf ww_mutex, wouldn't be > an RCU be more appropriate here? E.g. aren't we just interested that the > current assigned fence at some point is signaled? Yeah, as an optimization you can get the set of currently attached fences to a dma-buf with just rcu. But if you update the set of fences attached to a dma-buf (e.g. radeon blits the newly rendered frame to a dma-buf exported by i915 for scanout on i915) then you need a write lock on that buffer. Which is what the ww_mutex is for, to make sure that you don't deadlock with i915 doing concurrent ops on the same underlying buffer. > Something like grab ww_mutexes, grab a reference to the current fence > object, release ww_mutex, wait for fence, release reference to the fence > object. Yeah, if the only thing you want to do is wait for fences, then the rcu-protected fence ref grabbing + lockless waiting is all you need. But e.g. in an execbuf you also need to update fences and maybe deep down in the reservation code you notice that you need to evict some stuff and so need to wait on some other guy to finish, and it's too complicated to drop and reacquire all the locks. 
Or you simply need to do a blocking wait on other gpus (because there's no direct hw sync mechanism) and again dropping locks would needlessly complicate the code. So I think we should allow this just to avoid too hairy/brittle (and almost definitely little tested code) in drivers. Afaik this is also the same way ttm currently handles things wrt buffer reservation and eviction. >> Agreed that any shared locks are out of the way (especially stuff like >> dev->struct_mutex or other non-strictly driver-private stuff, i915 is >> really bad here still). > > > Yeah that's also an point I've wanted to note on Maartens patch. Radeon > grabs the read side of it's exclusive semaphore while waiting for fences > (because it assumes that the fence it waits for is a Radeon fence). > > Assuming that we need to wait in both directions with Prime (e.g. Intel > driver needs to wait for Radeon to finish rendering and Radeon needs to wait > for Intel to finish displaying), this might become a perfect example of > locking inversion. fence updates are atomic on a dma-buf, protected by ww_mutex. The neat trick of ww_mutex is that they enforce a global ordering, so in your scenario either i915 or radeon would be first and you can't deadlock. There is no way to interleave anything even if you have lots of buffers shared between i915/radeon. Wrt deadlocking it's exactly the same guarantees as the magic ttm provides for just one driver with concurrent command submission since it's the same idea. >> So from the core fence framework I think we already have exactly this, >> and we only need to adjust the radeon implementation a bit to make it >> less risky and invasive to the radeon driver logic. > > > Agree. Well the biggest problem I see is that exclusive semaphore I need to > take when anything calls into the driver. For the fence code I need to move > that down into the fence->signaled handler, cause that now can be called > from outside the driver. > > Maarten solved this by telling the driver in the lockup handler (where we > grab the write side of the exclusive lock) that all interrupts are already > enabled, so that fence->signaled hopefully wouldn't mess with the hardware > at all. While this probably works, it just leaves me with a feeling that we > are doing something wrong here. I'm not versed on the details in readon, but on i915 we can attach a memory location and cookie value to each fence and just do a memory fetch to figure out whether the fence has passed or not. So no locking needed at all. Of course the fence itself needs to lock a reference onto that memory location, which is a neat piece of integration work that we still need to tackle in some cases - there's conflicting patch series all over this ;-) But like I've said fence->signaled is optional so you don't need this necessarily, as long as radeon eventually calls fence_signaled once the fence has completed. -Daniel
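The "memory location and cookie value" scheme Daniel mentions amounts to a ->signaled callback that is just a read, with fence_signal() issued from the driver's own interrupt handler. A minimal sketch follows; the wrapper struct and its fields are made up for illustration, only fence_signal() and the fence base struct come from the series.

```c
#include <linux/fence.h>
#include <linux/io.h>
#include <linux/kernel.h>

/* made-up wrapper struct for illustration */
struct seqno_sw_fence {
	struct fence base;
	u32 __iomem *hw_seqno;	/* CPU-visible location the GPU writes */
	u32 cookie;		/* value the GPU writes once this fence is done */
};

/* lock-free ->signaled: a single read, safe to call from any context */
static bool seqno_sw_fence_signaled(struct fence *f)
{
	struct seqno_sw_fence *sf = container_of(f, struct seqno_sw_fence, base);

	/* wrap-safe seqno comparison */
	return (s32)(readl(sf->hw_seqno) - sf->cookie) >= 0;
}

/* called from the driver's own irq handler once the seqno advances */
static void seqno_sw_fence_irq(struct seqno_sw_fence *sf)
{
	if (seqno_sw_fence_signaled(&sf->base))
		fence_signal(&sf->base);
}
```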
> Maybe I've mixed things up a bit in my description. There is > fence_signal which the implementor/exporter of a fence must call when > the fence is completed. If the exporter has an ->enable_signaling > callback it can delay that call to fence_signal for as long as it > wishes as long as enable_signaling isn't called yet. But that's just > the optimization to not required irqs to be turned on all the time. > > The other function is fence_is_signaled, which is used by code that is > interested in the fence state, together with fence_wait if it wants to > block and not just wants to know the momentary fence state. All the > other functions (the stuff that adds callbacks and the various _locked > and other versions) are just for fancy special cases. Well that's rather bad, cause IRQs aren't reliable enough on Radeon HW for such a thing. Especially on Prime systems and Macs. That's why we have this fancy HZ/2 timeout on all fence wait operations to manually check if the fence is signaled or not. To guarantee that a fence is signaled after enable_signaling is called we would need to fire up a kernel thread which periodically calls fence->signaled. Christian. Am 22.07.2014 18:21, schrieb Daniel Vetter: > On Tue, Jul 22, 2014 at 5:59 PM, Christian König > <deathsimple@vodafone.de> wrote: >> Am 22.07.2014 17:42, schrieb Daniel Vetter: >> >>> On Tue, Jul 22, 2014 at 5:35 PM, Christian König >>> <christian.koenig@amd.com> wrote: >>>> Drivers exporting fences need to provide a fence->signaled and a >>>> fence->wait >>>> function, everything else like fence->enable_signaling or calling >>>> fence_signaled() from the driver is optional. >>>> >>>> Drivers wanting to use exported fences don't call fence->signaled or >>>> fence->wait in atomic or interrupt context, and not with holding any >>>> global >>>> locking primitives (like mmap_sem etc...). Holding locking primitives >>>> local >>>> to the driver is ok, as long as they don't conflict with anything >>>> possible >>>> used by their own fence implementation. >>> Well that's almost what we have right now with the exception that >>> drivers are allowed (actually must for correctness when updating >>> fences) the ww_mutexes for dma-bufs (or other buffer objects). >> >> In this case sorry for so much noise. I really haven't looked in so much >> detail into anything but Maarten's Radeon patches. >> >> But how does that then work right now? My impression was that it's mandatory >> for drivers to call fence_signaled()? > Maybe I've mixed things up a bit in my description. There is > fence_signal which the implementor/exporter of a fence must call when > the fence is completed. If the exporter has an ->enable_signaling > callback it can delay that call to fence_signal for as long as it > wishes as long as enable_signaling isn't called yet. But that's just > the optimization to not required irqs to be turned on all the time. > > The other function is fence_is_signaled, which is used by code that is > interested in the fence state, together with fence_wait if it wants to > block and not just wants to know the momentary fence state. All the > other functions (the stuff that adds callbacks and the various _locked > and other versions) are just for fancy special cases. > >>> Locking >>> correctness is enforced with some extremely nasty lockdep annotations >>> + additional debugging infrastructure enabled with >>> CONFIG_DEBUG_WW_MUTEX_SLOWPATH. We really need to be able to hold >>> dma-buf ww_mutexes while updating fences or waiting for them. 
And >>> obviously for ->wait we need non-atomic context, not just >>> non-interrupt. >> >> Sounds mostly reasonable, but for holding the dma-buf ww_mutex, wouldn't be >> an RCU be more appropriate here? E.g. aren't we just interested that the >> current assigned fence at some point is signaled? > Yeah, as an optimization you can get the set of currently attached > fences to a dma-buf with just rcu. But if you update the set of fences > attached to a dma-buf (e.g. radeon blits the newly rendered frame to a > dma-buf exported by i915 for scanout on i915) then you need a write > lock on that buffer. Which is what the ww_mutex is for, to make sure > that you don't deadlock with i915 doing concurrent ops on the same > underlying buffer. > >> Something like grab ww_mutexes, grab a reference to the current fence >> object, release ww_mutex, wait for fence, release reference to the fence >> object. > Yeah, if the only thing you want to do is wait for fences, then the > rcu-protected fence ref grabbing + lockless waiting is all you need. > But e.g. in an execbuf you also need to update fences and maybe deep > down in the reservation code you notice that you need to evict some > stuff and so need to wait on some other guy to finish, and it's too > complicated to drop and reacquire all the locks. Or you simply need to > do a blocking wait on other gpus (because there's no direct hw sync > mechanism) and again dropping locks would needlessly complicate the > code. So I think we should allow this just to avoid too hairy/brittle > (and almost definitely little tested code) in drivers. > > Afaik this is also the same way ttm currently handles things wrt > buffer reservation and eviction. > >>> Agreed that any shared locks are out of the way (especially stuff like >>> dev->struct_mutex or other non-strictly driver-private stuff, i915 is >>> really bad here still). >> >> Yeah that's also an point I've wanted to note on Maartens patch. Radeon >> grabs the read side of it's exclusive semaphore while waiting for fences >> (because it assumes that the fence it waits for is a Radeon fence). >> >> Assuming that we need to wait in both directions with Prime (e.g. Intel >> driver needs to wait for Radeon to finish rendering and Radeon needs to wait >> for Intel to finish displaying), this might become a perfect example of >> locking inversion. > fence updates are atomic on a dma-buf, protected by ww_mutex. The neat > trick of ww_mutex is that they enforce a global ordering, so in your > scenario either i915 or radeon would be first and you can't deadlock. > There is no way to interleave anything even if you have lots of > buffers shared between i915/radeon. Wrt deadlocking it's exactly the > same guarantees as the magic ttm provides for just one driver with > concurrent command submission since it's the same idea. > >>> So from the core fence framework I think we already have exactly this, >>> and we only need to adjust the radeon implementation a bit to make it >>> less risky and invasive to the radeon driver logic. >> >> Agree. Well the biggest problem I see is that exclusive semaphore I need to >> take when anything calls into the driver. For the fence code I need to move >> that down into the fence->signaled handler, cause that now can be called >> from outside the driver. 
>> >> Maarten solved this by telling the driver in the lockup handler (where we >> grab the write side of the exclusive lock) that all interrupts are already >> enabled, so that fence->signaled hopefully wouldn't mess with the hardware >> at all. While this probably works, it just leaves me with a feeling that we >> are doing something wrong here. > I'm not versed on the details in readon, but on i915 we can attach a > memory location and cookie value to each fence and just do a memory > fetch to figure out whether the fence has passed or not. So no locking > needed at all. Of course the fence itself needs to lock a reference > onto that memory location, which is a neat piece of integration work > that we still need to tackle in some cases - there's conflicting patch > series all over this ;-) > > But like I've said fence->signaled is optional so you don't need this > necessarily, as long as radeon eventually calls fence_signaled once > the fence has completed. > -Daniel
On Tue, Jul 22, 2014 at 6:21 PM, Daniel Vetter <daniel.vetter@ffwll.ch> wrote: > But like I've said fence->signaled is optional so you don't need this > necessarily, as long as radeon eventually calls fence_signaled once > the fence has completed. Actually I've chatted a bit with Maarten about the different ways we could restrict both the calling context and the implementations for fence callbacks to avoid surprises. One is certainly that we need WARN_ON(in_interrupt) around the wait, enable_signaling and add-callback stuff. But we also talked about ensuring that the ->signaled callback never sleeps by wrapping it in a preempt_disable/enable section. Currently sleeping is allowed in ->signaled, which the radeon implementation here does. I think it would be reasonable to forbid this and drop __radeon_fence_signaled. -Daniel
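What those restrictions could look like in the core, sketched under the assumption that fence_enable_sw_signaling() and the ops table are as in the posted series; the checked wrappers themselves are hypothetical helpers, not existing API.

```c
#include <linux/fence.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>

/* hypothetical wrapper: enforce "process context only" for enabling signaling */
static void fence_enable_sw_signaling_checked(struct fence *fence)
{
	WARN_ON(in_interrupt());
	fence_enable_sw_signaling(fence);
}

/* hypothetical wrapper: forbid sleeping inside ->signaled */
static bool fence_check_signaled_atomic(struct fence *fence)
{
	bool ret = false;

	preempt_disable();
	if (fence->ops->signaled)
		ret = fence->ops->signaled(fence); /* sleeping here would now splat */
	preempt_enable();

	return ret;
}
```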
On Tue, Jul 22, 2014 at 6:39 PM, Christian König <christian.koenig@amd.com> wrote: >> Maybe I've mixed things up a bit in my description. There is >> fence_signal which the implementor/exporter of a fence must call when >> the fence is completed. If the exporter has an ->enable_signaling >> callback it can delay that call to fence_signal for as long as it >> wishes as long as enable_signaling isn't called yet. But that's just >> the optimization to not required irqs to be turned on all the time. >> >> The other function is fence_is_signaled, which is used by code that is >> interested in the fence state, together with fence_wait if it wants to >> block and not just wants to know the momentary fence state. All the >> other functions (the stuff that adds callbacks and the various _locked >> and other versions) are just for fancy special cases. > > Well that's rather bad, cause IRQs aren't reliable enough on Radeon HW for > such a thing. Especially on Prime systems and Macs. > > That's why we have this fancy HZ/2 timeout on all fence wait operations to > manually check if the fence is signaled or not. > > To guarantee that a fence is signaled after enable_signaling is called we > would need to fire up a kernel thread which periodically calls > fence->signaled. We actually have seen similar fun on some i915 platforms. I wonder whether we shouldn't have something in the fence core for this given how common it is. Currently we have the same trick with regular wakups on platforms with unreliable interrupts, but I haven't yet looked at how we'll do this with callbacks once we add the scheduler and fences. It might be though that we've finally fixed these coherency issues between the interrupt and the fence write for real. -Daniel
op 22-07-14 17:59, Christian König schreef: > Am 22.07.2014 17:42, schrieb Daniel Vetter: >> On Tue, Jul 22, 2014 at 5:35 PM, Christian König >> <christian.koenig@amd.com> wrote: >>> Drivers exporting fences need to provide a fence->signaled and a fence->wait >>> function, everything else like fence->enable_signaling or calling >>> fence_signaled() from the driver is optional. >>> >>> Drivers wanting to use exported fences don't call fence->signaled or >>> fence->wait in atomic or interrupt context, and not with holding any global >>> locking primitives (like mmap_sem etc...). Holding locking primitives local >>> to the driver is ok, as long as they don't conflict with anything possible >>> used by their own fence implementation. >> Well that's almost what we have right now with the exception that >> drivers are allowed (actually must for correctness when updating >> fences) the ww_mutexes for dma-bufs (or other buffer objects). > > In this case sorry for so much noise. I really haven't looked in so much detail into anything but Maarten's Radeon patches. > > But how does that then work right now? My impression was that it's mandatory for drivers to call fence_signaled()? It's only mandatory to call fence_signal() if the .enable_signaling callback has been called, else you can get away with never calling signaling a fence at all before dropping the last refcount to it. This allows you to keep interrupts disabled when you don't need them. >> Locking >> correctness is enforced with some extremely nasty lockdep annotations >> + additional debugging infrastructure enabled with >> CONFIG_DEBUG_WW_MUTEX_SLOWPATH. We really need to be able to hold >> dma-buf ww_mutexes while updating fences or waiting for them. And >> obviously for ->wait we need non-atomic context, not just >> non-interrupt. > > Sounds mostly reasonable, but for holding the dma-buf ww_mutex, wouldn't be an RCU be more appropriate here? E.g. aren't we just interested that the current assigned fence at some point is signaled? You can wait with RCU, without holding the ww_mutex, by calling reservation_object_wait_timeout_rcu on ttm_bo->resv. If you don't want to block you could test with reservation_object_test_signaled_rcu. Or if you want a copy of all fences without taking locks, try reservation_object_get_fences_rcu. (Might be out of date by the time the function returns if you don't hold ww_mutex, if you hold ww_mutex you probably don't need to call this function.) I didn't add non-rcu versions, but using the RCU functions would work with ww_mutex held too, probably with some small overhead. > Something like grab ww_mutexes, grab a reference to the current fence object, release ww_mutex, wait for fence, release reference to the fence object. This is what I do currently. :-) The reservation_object that's embedded in TTM gets shared with the dma-buf, so there will be no special case needed for dma-buf at all, all objects can simply be shared and the synchronization is handled in the same way. ttm_bo_reserve and friends automatically end up locking the dma-buf too, and lockdep works on it. > >> Agreed that any shared locks are out of the way (especially stuff like >> dev->struct_mutex or other non-strictly driver-private stuff, i915 is >> really bad here still). > > Yeah that's also an point I've wanted to note on Maartens patch. Radeon grabs the read side of it's exclusive semaphore while waiting for fences (because it assumes that the fence it waits for is a Radeon fence). 
> > Assuming that we need to wait in both directions with Prime (e.g. Intel driver needs to wait for Radeon to finish rendering and Radeon needs to wait for Intel to finish displaying), this might become a perfect example of locking inversion. In the preliminary patches where I can sync radeon with other GPU's I've been very careful in all the places that call into fences, to make sure that radeon wouldn't try to handle lockups for a different (possibly also radeon) card. This is also why fence_is_signaled should never block, and why it trylocks the exclusive_lock. :-) I think lockdep would complain if I grabbed exclusive_lock while blocking in is_signaled. >> So from the core fence framework I think we already have exactly this, >> and we only need to adjust the radeon implementation a bit to make it >> less risky and invasive to the radeon driver logic. > > Agree. Well the biggest problem I see is that exclusive semaphore I need to take when anything calls into the driver. For the fence code I need to move that down into the fence->signaled handler, cause that now can be called from outside the driver. > > Maarten solved this by telling the driver in the lockup handler (where we grab the write side of the exclusive lock) that all interrupts are already enabled, so that fence->signaled hopefully wouldn't mess with the hardware at all. While this probably works, it just leaves me with a feeling that we are doing something wrong here. There is unfortunately no global mechanism to say 'this card is locked up, please don't call into any of my fences', and I don't associate fences with devices, and radeon doesn't keep a global list of fences. If all of that existed, it would complicate the interface and its callers a lot, while I like to keep things simple. So I did the best I could, and simply prevented the fence calls from fiddling with the hardware. Fortunately gpu lockup is not a common operation. :-) ~Maarten
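A usage sketch of the RCU helpers Maarten names here; the function names come from the reservation/fence series, while the helper itself, its bo argument and the 30 second timeout are illustrative assumptions.

```c
#include <drm/ttm/ttm_bo_api.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/reservation.h>

/* wait for the fences currently attached to a bo, without taking its ww_mutex */
static int wait_for_bo_idle(struct ttm_buffer_object *bo, bool wait_all)
{
	long ret;

	ret = reservation_object_wait_timeout_rcu(bo->resv, wait_all,
						  true, /* interruptible */
						  msecs_to_jiffies(30000));
	if (ret == 0)
		return -ETIMEDOUT;	/* fences still pending */
	if (ret < 0)
		return ret;		/* e.g. -ERESTARTSYS */
	return 0;
}
```

For the non-blocking case, reservation_object_test_signaled_rcu() covers the same ground in the same lockless way.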
Am 23.07.2014 08:40, schrieb Maarten Lankhorst: > op 22-07-14 17:59, Christian König schreef: >> Am 22.07.2014 17:42, schrieb Daniel Vetter: >>> On Tue, Jul 22, 2014 at 5:35 PM, Christian König >>> <christian.koenig@amd.com> wrote: >>>> Drivers exporting fences need to provide a fence->signaled and a fence->wait >>>> function, everything else like fence->enable_signaling or calling >>>> fence_signaled() from the driver is optional. >>>> >>>> Drivers wanting to use exported fences don't call fence->signaled or >>>> fence->wait in atomic or interrupt context, and not with holding any global >>>> locking primitives (like mmap_sem etc...). Holding locking primitives local >>>> to the driver is ok, as long as they don't conflict with anything possible >>>> used by their own fence implementation. >>> Well that's almost what we have right now with the exception that >>> drivers are allowed (actually must for correctness when updating >>> fences) the ww_mutexes for dma-bufs (or other buffer objects). >> In this case sorry for so much noise. I really haven't looked in so much detail into anything but Maarten's Radeon patches. >> >> But how does that then work right now? My impression was that it's mandatory for drivers to call fence_signaled()? > It's only mandatory to call fence_signal() if the .enable_signaling callback has been called, else you can get away with never calling signaling a fence at all before dropping the last refcount to it. > This allows you to keep interrupts disabled when you don't need them. Can we somehow avoid the need to call fence_signal() at all? The interrupts at least on radeon are way to unreliable for such a thing. Can enable_signalling fail? What's the reason for fence_signaled() in the first place? >>> Agreed that any shared locks are out of the way (especially stuff like >>> dev->struct_mutex or other non-strictly driver-private stuff, i915 is >>> really bad here still). >> Yeah that's also an point I've wanted to note on Maartens patch. Radeon grabs the read side of it's exclusive semaphore while waiting for fences (because it assumes that the fence it waits for is a Radeon fence). >> >> Assuming that we need to wait in both directions with Prime (e.g. Intel driver needs to wait for Radeon to finish rendering and Radeon needs to wait for Intel to finish displaying), this might become a perfect example of locking inversion. > In the preliminary patches where I can sync radeon with other GPU's I've been very careful in all the places that call into fences, to make sure that radeon wouldn't try to handle lockups for a different (possibly also radeon) card. That's actually not such a good idea. In case of a lockup we need to handle the lockup cause otherwise it could happen that radeon waits for the lockup to be resolved and the lockup handling needs to wait for a fence that's never signaled because of the lockup. Christian. > > This is also why fence_is_signaled should never block, and why it trylocks the exclusive_lock. :-) I think lockdep would complain if I grabbed exclusive_lock while blocking in is_signaled. > >>> So from the core fence framework I think we already have exactly this, >>> and we only need to adjust the radeon implementation a bit to make it >>> less risky and invasive to the radeon driver logic. >> Agree. Well the biggest problem I see is that exclusive semaphore I need to take when anything calls into the driver. For the fence code I need to move that down into the fence->signaled handler, cause that now can be called from outside the driver. 
>> >> Maarten solved this by telling the driver in the lockup handler (where we grab the write side of the exclusive lock) that all interrupts are already enabled, so that fence->signaled hopefully wouldn't mess with the hardware at all. While this probably works, it just leaves me with a feeling that we are doing something wrong here. > There is unfortunately no global mechanism to say 'this card is locked up, please don't call into any of my fences', and I don't associate fences with devices, and radeon doesn't keep a global list of fences. > If all of that existed, it would complicate the interface and its callers a lot, while I like to keep things simple. > So I did the best I could, and simply prevented the fence calls from fiddling with the hardware. Fortunately gpu lockup is not a common operation. :-) > > ~Maarten > >
On Wed, Jul 23, 2014 at 8:52 AM, Christian König <christian.koenig@amd.com> wrote: >> In the preliminary patches where I can sync radeon with other GPU's I've >> been very careful in all the places that call into fences, to make sure that >> radeon wouldn't try to handle lockups for a different (possibly also radeon) >> card. > > That's actually not such a good idea. > > In case of a lockup we need to handle the lockup cause otherwise it could > happen that radeon waits for the lockup to be resolved and the lockup > handling needs to wait for a fence that's never signaled because of the > lockup. I thought the plan for now is that each driver handles lockups itself. So if any batch gets stuck for too long (whether it's our own gpu that's stuck or whether we're somehow stuck on a fence from a 2nd gpu doesn't matter) the driver steps in with a reset and signals completion to all its own fences that have been in that pile-up. As long as each driver participating in fencing has means to abort/reset we'll eventually get unstuck. Essentially every driver has to guarantee that, assuming dependent fences all complete eventually, it _will_ complete its own fences no matter what. For now this should be good enough, but for arb_robustness or people who care a bit about their compute results we need reliable notification to userspace that a reset happened. I think we could add a new "aborted" fence state for that case and then propagate that. But given how tricky the code to compute reset victims in i915 is already I think we should leave this out for now. And even later on make it strictly opt-in. -Daniel
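The guarantee described here, that after a reset a driver force-completes all of its own fences that were caught in the pile-up, could look roughly like the following. Everything except fence_signal() is hypothetical, in particular the driver-private wrapper and pending list.

```c
#include <linux/fence.h>
#include <linux/list.h>
#include <linux/spinlock.h>

/* hypothetical per-driver fence wrapper tracked on a pending list */
struct hang_fence {
	struct fence base;
	struct list_head node;
};

/*
 * Called at the end of the driver's reset path: force-complete every fence
 * that was part of the pile-up so waiters in other drivers get unstuck.
 * A future "aborted" fence state could additionally be set here.
 */
static void reset_complete_pending_fences(struct list_head *pending,
					  spinlock_t *pending_lock)
{
	struct hang_fence *f, *tmp;

	spin_lock(pending_lock);
	list_for_each_entry_safe(f, tmp, pending, node) {
		fence_signal(&f->base);
		list_del_init(&f->node);
	}
	spin_unlock(pending_lock);
}
```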
op 23-07-14 08:52, Christian König schreef: > Am 23.07.2014 08:40, schrieb Maarten Lankhorst: >> op 22-07-14 17:59, Christian König schreef: >>> Am 22.07.2014 17:42, schrieb Daniel Vetter: >>>> On Tue, Jul 22, 2014 at 5:35 PM, Christian König >>>> <christian.koenig@amd.com> wrote: >>>>> Drivers exporting fences need to provide a fence->signaled and a fence->wait >>>>> function, everything else like fence->enable_signaling or calling >>>>> fence_signaled() from the driver is optional. >>>>> >>>>> Drivers wanting to use exported fences don't call fence->signaled or >>>>> fence->wait in atomic or interrupt context, and not with holding any global >>>>> locking primitives (like mmap_sem etc...). Holding locking primitives local >>>>> to the driver is ok, as long as they don't conflict with anything possible >>>>> used by their own fence implementation. >>>> Well that's almost what we have right now with the exception that >>>> drivers are allowed (actually must for correctness when updating >>>> fences) the ww_mutexes for dma-bufs (or other buffer objects). >>> In this case sorry for so much noise. I really haven't looked in so much detail into anything but Maarten's Radeon patches. >>> >>> But how does that then work right now? My impression was that it's mandatory for drivers to call fence_signaled()? >> It's only mandatory to call fence_signal() if the .enable_signaling callback has been called, else you can get away with never calling signaling a fence at all before dropping the last refcount to it. >> This allows you to keep interrupts disabled when you don't need them. > > Can we somehow avoid the need to call fence_signal() at all? The interrupts at least on radeon are way to unreliable for such a thing. Can enable_signalling fail? What's the reason for fence_signaled() in the first place? It doesn't need to be completely reliable, or finish immediately. And any time wake_up_all(&rdev->fence_queue) is called all the fences that were enabled will be rechecked. >>>> Agreed that any shared locks are out of the way (especially stuff like >>>> dev->struct_mutex or other non-strictly driver-private stuff, i915 is >>>> really bad here still). >>> Yeah that's also an point I've wanted to note on Maartens patch. Radeon grabs the read side of it's exclusive semaphore while waiting for fences (because it assumes that the fence it waits for is a Radeon fence). >>> >>> Assuming that we need to wait in both directions with Prime (e.g. Intel driver needs to wait for Radeon to finish rendering and Radeon needs to wait for Intel to finish displaying), this might become a perfect example of locking inversion. >> In the preliminary patches where I can sync radeon with other GPU's I've been very careful in all the places that call into fences, to make sure that radeon wouldn't try to handle lockups for a different (possibly also radeon) card. > > That's actually not such a good idea. > > In case of a lockup we need to handle the lockup cause otherwise it could happen that radeon waits for the lockup to be resolved and the lockup handling needs to wait for a fence that's never signaled because of the lockup. The lockup handling calls radeon_fence_wait, not the generic fence_wait. It doesn't call the exported wait function that takes the exclusive_lock in read mode. And lockdep should have complained if I screwed that up. :-) ~Maarten
On Wed, Jul 23, 2014 at 9:06 AM, Maarten Lankhorst <maarten.lankhorst@canonical.com> wrote: >> Can we somehow avoid the need to call fence_signal() at all? The interrupts at least on radeon are way to unreliable for such a thing. Can enable_signalling fail? What's the reason for fence_signaled() in the first place? > It doesn't need to be completely reliable, or finish immediately. > > And any time wake_up_all(&rdev->fence_queue) is called all the fences that were enabled will be rechecked. I raised this already somewhere else, but should we have some common infrastructure in the core fence code to recheck fences periodically? radeon doesn't seem to be the only hw where this isn't reliable enough. Of course timer-based rechecking would only work if the driver provides the fence->signalled callback to recheck actual fence state. -Daniel
Am 23.07.2014 09:09, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 9:06 AM, Maarten Lankhorst > <maarten.lankhorst@canonical.com> wrote: >>> Can we somehow avoid the need to call fence_signal() at all? The interrupts at least on radeon are way to unreliable for such a thing. Can enable_signalling fail? What's the reason for fence_signaled() in the first place? >> It doesn't need to be completely reliable, or finish immediately. >> >> And any time wake_up_all(&rdev->fence_queue) is called all the fences that were enabled will be rechecked. > I raised this already somewhere else, but should we have some common > infrastructure in the core fence code to recheck fences periodically? > radeon doesn't seem to be the only hw where this isn't reliable > enough. Of course timer-based rechecking would only work if the driver > provides the fence->signalled callback to recheck actual fence state. Yeah, agree. The proposal won't work reliably at all with radeon. Interrupts are accumulated before sending them to the CPU, e.g. you can get one interrupt for multiple finished fences. If it's just the interrupt for the last fence submitted that gets lost you are completely screwed, because you won't get another interrupt. I had that problem multiple times while working on UVD support, resulting in the driver thinking that it can't submit more jobs because none of the interrupts for the already submitted fences came through. Apart from that, interrupts on Macs usually don't work at all, so we really need a solution where calling fence_signaled() is completely optional. Christian. > -Daniel
Am 23.07.2014 09:06, schrieb Maarten Lankhorst: > op 23-07-14 08:52, Christian König schreef: >> Am 23.07.2014 08:40, schrieb Maarten Lankhorst: >>> op 22-07-14 17:59, Christian König schreef: >>>> Am 22.07.2014 17:42, schrieb Daniel Vetter: >>>>> On Tue, Jul 22, 2014 at 5:35 PM, Christian König >>>>> <christian.koenig@amd.com> wrote: >>>>>> Drivers exporting fences need to provide a fence->signaled and a fence->wait >>>>>> function, everything else like fence->enable_signaling or calling >>>>>> fence_signaled() from the driver is optional. >>>>>> >>>>>> Drivers wanting to use exported fences don't call fence->signaled or >>>>>> fence->wait in atomic or interrupt context, and not with holding any global >>>>>> locking primitives (like mmap_sem etc...). Holding locking primitives local >>>>>> to the driver is ok, as long as they don't conflict with anything possible >>>>>> used by their own fence implementation. >>>>> Well that's almost what we have right now with the exception that >>>>> drivers are allowed (actually must for correctness when updating >>>>> fences) the ww_mutexes for dma-bufs (or other buffer objects). >>>> In this case sorry for so much noise. I really haven't looked in so much detail into anything but Maarten's Radeon patches. >>>> >>>> But how does that then work right now? My impression was that it's mandatory for drivers to call fence_signaled()? >>> It's only mandatory to call fence_signal() if the .enable_signaling callback has been called, else you can get away with never calling signaling a fence at all before dropping the last refcount to it. >>> This allows you to keep interrupts disabled when you don't need them. >> Can we somehow avoid the need to call fence_signal() at all? The interrupts at least on radeon are way to unreliable for such a thing. Can enable_signalling fail? What's the reason for fence_signaled() in the first place? > It doesn't need to be completely reliable, or finish immediately. > > And any time wake_up_all(&rdev->fence_queue) is called all the fences that were enabled will be rechecked. > >>>>> Agreed that any shared locks are out of the way (especially stuff like >>>>> dev->struct_mutex or other non-strictly driver-private stuff, i915 is >>>>> really bad here still). >>>> Yeah that's also an point I've wanted to note on Maartens patch. Radeon grabs the read side of it's exclusive semaphore while waiting for fences (because it assumes that the fence it waits for is a Radeon fence). >>>> >>>> Assuming that we need to wait in both directions with Prime (e.g. Intel driver needs to wait for Radeon to finish rendering and Radeon needs to wait for Intel to finish displaying), this might become a perfect example of locking inversion. >>> In the preliminary patches where I can sync radeon with other GPU's I've been very careful in all the places that call into fences, to make sure that radeon wouldn't try to handle lockups for a different (possibly also radeon) card. >> That's actually not such a good idea. >> >> In case of a lockup we need to handle the lockup cause otherwise it could happen that radeon waits for the lockup to be resolved and the lockup handling needs to wait for a fence that's never signaled because of the lockup. > The lockup handling calls radeon_fence_wait, not the generic fence_wait. It doesn't call the exported wait function that takes the exclusive_lock in read mode. > And lockdep should have complained if I screwed that up. 
:-) You screwed it up and lockdep didn't warn you about it :-P It's not a locking problem I'm talking about here. Radeon's lockup handling kicks in when anything calls into the driver from the outside; if you have a fence wait function that's called from the outside but doesn't handle lockups, you essentially rely on somebody else calling another radeon function for the lockup to be resolved. Christian. > > ~Maarten >
On Wed, Jul 23, 2014 at 9:26 AM, Christian König <deathsimple@vodafone.de> wrote: > It's not a locking problem I'm talking about here. Radeons lockup handling > kicks in when anything calls into the driver from the outside, if you have a > fence wait function that's called from the outside but doesn't handle > lockups you essentially rely on somebody else calling another radeon > function for the lockup to be resolved. So you don't have a timer in radeon that periodically checks whether progress is still being made? That's the approach we're using in i915, together with some tricks to kick any stuck waiters so that we can reliably step in and grab locks for the reset. -Daniel
op 23-07-14 09:15, Christian König schreef: > Am 23.07.2014 09:09, schrieb Daniel Vetter: >> On Wed, Jul 23, 2014 at 9:06 AM, Maarten Lankhorst >> <maarten.lankhorst@canonical.com> wrote: >>>> Can we somehow avoid the need to call fence_signal() at all? The interrupts at least on radeon are way to unreliable for such a thing. Can enable_signalling fail? What's the reason for fence_signaled() in the first place? >>> It doesn't need to be completely reliable, or finish immediately. >>> >>> And any time wake_up_all(&rdev->fence_queue) is called all the fences that were enabled will be rechecked. >> I raised this already somewhere else, but should we have some common >> infrastructure in the core fence code to recheck fences periodically? >> radeon doesn't seem to be the only hw where this isn't reliable >> enough. Of course timer-based rechecking would only work if the driver >> provides the fence->signalled callback to recheck actual fence state. > > Yeah, agree. The proposal won't work reliable at all with radeon. > > Interrupts are accumulated before sending them to the CPU, e.g. you can get one interrupt for multiple fences finished. If it's just the interrupt for the last fence submitted that gets lost you are completely screwed up because you won't get another interrupt. > > I had that problem multiple times while working on UVD support, resulting in the driver thinking that it can't submit more jobs because non of the interrupts for the already submitted fence cam through. Yeah but all the fences that have .enable_signaling will get signaled from a single interrupt, or when any waiter calls radeon_fence_process. > Apart from that interrupts on Macs usually don't work at all, so we really need a solution where calling fence_signaled() is completely optional. I haven't had a problem with interrupts on my mbp after d1f9809ed1315c4cdc5760cf2f59626fd3276952, but it should be trivial to start a timer that periodically does wake_up_all, and gets its timeout reset in a call to radeon_fence_process. It could either be added as a work item, or as a normal timer (disabled during gpu lockup recovery to prevent checks from fiddling with things it shouldn't). ~Maarten
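A sketch of that fallback timer as it might look inside radeon_fence.c: rdev->fence_queue and radeon_fence_driver_init() exist in the driver, while the fence_recheck_timer field and the HZ/2 period are assumptions for illustration.

```c
#include "radeon.h"
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/wait.h>

#define RADEON_FENCE_RECHECK_PERIOD	(HZ / 2)	/* assumed period */

/*
 * Timer callback: kick the fence queue so every enabled fence rechecks its
 * seqno even if the interrupt for it was lost.  radeon_fence_process() would
 * push the timer back with mod_timer() whenever real fence activity is seen,
 * and the timer would be stopped while GPU reset is in progress.
 */
static void radeon_fence_recheck(unsigned long data)
{
	struct radeon_device *rdev = (struct radeon_device *)data;

	wake_up_all(&rdev->fence_queue);
	mod_timer(&rdev->fence_recheck_timer,
		  jiffies + RADEON_FENCE_RECHECK_PERIOD);
}

/* one-time setup, e.g. from radeon_fence_driver_init() */
static void radeon_fence_recheck_start(struct radeon_device *rdev)
{
	setup_timer(&rdev->fence_recheck_timer, radeon_fence_recheck,
		    (unsigned long)rdev);
	mod_timer(&rdev->fence_recheck_timer,
		  jiffies + RADEON_FENCE_RECHECK_PERIOD);
}
```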
Am 23.07.2014 09:31, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 9:26 AM, Christian König > <deathsimple@vodafone.de> wrote: >> It's not a locking problem I'm talking about here. Radeons lockup handling >> kicks in when anything calls into the driver from the outside, if you have a >> fence wait function that's called from the outside but doesn't handle >> lockups you essentially rely on somebody else calling another radeon >> function for the lockup to be resolved. > So you don't have a timer in radeon that periodically checks whether > progress is still being made? That's the approach we're using in i915, > together with some tricks to kick any stuck waiters so that we can > reliably step in and grab locks for the reset. We tried this approach, but it didn't work at all. I already considered trying it again because of the upcoming fence implementation, but thinking about it again: when a driver is forced to change its handling because of the fence implementation, that's just another hint that there is something wrong here. Christian. > -Daniel
Am 23.07.2014 09:32, schrieb Maarten Lankhorst: > op 23-07-14 09:15, Christian König schreef: >> Am 23.07.2014 09:09, schrieb Daniel Vetter: >>> On Wed, Jul 23, 2014 at 9:06 AM, Maarten Lankhorst >>> <maarten.lankhorst@canonical.com> wrote: >>>>> Can we somehow avoid the need to call fence_signal() at all? The interrupts at least on radeon are way to unreliable for such a thing. Can enable_signalling fail? What's the reason for fence_signaled() in the first place? >>>> It doesn't need to be completely reliable, or finish immediately. >>>> >>>> And any time wake_up_all(&rdev->fence_queue) is called all the fences that were enabled will be rechecked. >>> I raised this already somewhere else, but should we have some common >>> infrastructure in the core fence code to recheck fences periodically? >>> radeon doesn't seem to be the only hw where this isn't reliable >>> enough. Of course timer-based rechecking would only work if the driver >>> provides the fence->signalled callback to recheck actual fence state. >> Yeah, agree. The proposal won't work reliable at all with radeon. >> >> Interrupts are accumulated before sending them to the CPU, e.g. you can get one interrupt for multiple fences finished. If it's just the interrupt for the last fence submitted that gets lost you are completely screwed up because you won't get another interrupt. >> >> I had that problem multiple times while working on UVD support, resulting in the driver thinking that it can't submit more jobs because non of the interrupts for the already submitted fence cam through. > Yeah but all the fences that have .enable_signaling will get signaled from a single interrupt, or when any waiter calls radeon_fence_process. You still need to check if the fence is really signaled, cause radeon_fence_process might wakeup the wait queue because of something completely different. Apart from that you once again rely on somebody else calling radeon_fence_process. This will probably work most of the times, but it's not 100% reliable. > >> Apart from that interrupts on Macs usually don't work at all, so we really need a solution where calling fence_signaled() is completely optional. > I haven't had a problem with interrupts on my mbp after d1f9809ed1315c4cdc5760cf2f59626fd3276952, but it should be trivial to start a timer that periodically does wake_up_all, and gets its timeout reset in a call to radeon_fence_process. It could either be added as a work item, or as a normal timer (disabled during gpu lockup recovery to prevent checks from fiddling with things it shouldn't). That will probably work, but just again sounds like we force the driver to fit the fence implementation instead of the other way around. Why is that fence_signaled() call needed for in the first place? Christian. > > ~Maarten >
op 23-07-14 09:37, Christian König schreef: > Am 23.07.2014 09:31, schrieb Daniel Vetter: >> On Wed, Jul 23, 2014 at 9:26 AM, Christian König >> <deathsimple@vodafone.de> wrote: >>> It's not a locking problem I'm talking about here. Radeons lockup handling >>> kicks in when anything calls into the driver from the outside, if you have a >>> fence wait function that's called from the outside but doesn't handle >>> lockups you essentially rely on somebody else calling another radeon >>> function for the lockup to be resolved. >> So you don't have a timer in radeon that periodically checks whether >> progress is still being made? That's the approach we're using in i915, >> together with some tricks to kick any stuck waiters so that we can >> reliably step in and grab locks for the reset. > > We tried this approach, but it didn't worked at all. > > I already considered trying it again because of the upcoming fence implementation, but reconsidering that when a driver is forced to change it's handling because of the fence implementation that's just another hint that there is something wrong here. As far as I can tell it wouldn't need to be reworked for the fence implementation currently, only the moment you want to allow callers outside of radeon. :-) Doing a GPU lockup recovery in the wait function would be messy even right now, you would hit a deadlock in ttm_bo_delayed_delete -> ttm_bo_cleanup_refs_and_unlock. Regardless of the fence implementation, why would it be a good idea to do a full lockup recovery when some other driver is calling your wait function? That doesn't seem to be a nice thing to do, so I think a timeout is the best error you could return here, other drivers have to deal with that anyway. ~Maarten
> Regardless of the fence implementation, why would it be a good idea to do a full lockup recovery when some other driver is > calling your wait function? That doesn't seem to be a nice thing to do, so I think a timeout is the best error you could return here, > other drivers have to deal with that anyway. The problem is that we need to guarantee that the lockup will be resolved eventually. Just imagine an application using prime is locking up Radeon and because of that gets killed by the user. Nothing else in the system would use the Radeon hardware any more and so radeon gets only called by another driver waiting patiently for radeon to finish rendering which never happens because the whole thing is locked up and we don't get a chance to recover. Christian. Am 23.07.2014 09:51, schrieb Maarten Lankhorst: > op 23-07-14 09:37, Christian König schreef: >> Am 23.07.2014 09:31, schrieb Daniel Vetter: >>> On Wed, Jul 23, 2014 at 9:26 AM, Christian König >>> <deathsimple@vodafone.de> wrote: >>>> It's not a locking problem I'm talking about here. Radeons lockup handling >>>> kicks in when anything calls into the driver from the outside, if you have a >>>> fence wait function that's called from the outside but doesn't handle >>>> lockups you essentially rely on somebody else calling another radeon >>>> function for the lockup to be resolved. >>> So you don't have a timer in radeon that periodically checks whether >>> progress is still being made? That's the approach we're using in i915, >>> together with some tricks to kick any stuck waiters so that we can >>> reliably step in and grab locks for the reset. >> We tried this approach, but it didn't worked at all. >> >> I already considered trying it again because of the upcoming fence implementation, but reconsidering that when a driver is forced to change it's handling because of the fence implementation that's just another hint that there is something wrong here. > As far as I can tell it wouldn't need to be reworked for the fence implementation currently, only the moment you want to allow callers outside of radeon. :-) > Doing a GPU lockup recovery in the wait function would be messy even right now, you would hit a deadlock in ttm_bo_delayed_delete -> ttm_bo_cleanup_refs_and_unlock. > > Regardless of the fence implementation, why would it be a good idea to do a full lockup recovery when some other driver is > calling your wait function? That doesn't seem to be a nice thing to do, so I think a timeout is the best error you could return here, > other drivers have to deal with that anyway. > > ~Maarten >
On Wed, Jul 23, 2014 at 9:37 AM, Christian König <christian.koenig@amd.com> wrote: > Am 23.07.2014 09:31, schrieb Daniel Vetter: >> On Wed, Jul 23, 2014 at 9:26 AM, Christian König >> <deathsimple@vodafone.de> wrote: >>> >>> It's not a locking problem I'm talking about here. Radeons lockup >>> handling >>> kicks in when anything calls into the driver from the outside, if you >>> have a >>> fence wait function that's called from the outside but doesn't handle >>> lockups you essentially rely on somebody else calling another radeon >>> function for the lockup to be resolved. >> >> So you don't have a timer in radeon that periodically checks whether >> progress is still being made? That's the approach we're using in i915, >> together with some tricks to kick any stuck waiters so that we can >> reliably step in and grab locks for the reset. > > > We tried this approach, but it didn't worked at all. > > I already considered trying it again because of the upcoming fence > implementation, but reconsidering that when a driver is forced to change > it's handling because of the fence implementation that's just another hint > that there is something wrong here. Out of curiosity: What's the blocker for using a timer/scheduled work to reset radeon? Getting this right on i915 has been fairly tricky and we now have an elaborate multi-stage state machine to get the driver through a reset. So always interested in different solutions. -Daniel
On Wed, Jul 23, 2014 at 9:58 AM, Christian König <deathsimple@vodafone.de> wrote: > Just imagine an application using prime is locking up Radeon and because of > that gets killed by the user. Nothing else in the system would use the > Radeon hardware any more and so radeon gets only called by another driver > waiting patiently for radeon to finish rendering which never happens because > the whole thing is locked up and we don't get a chance to recover. But isn't that possible already without fences? X hangs radeon, user crashes X for unrelated reasons before radeon will notice the hang. Then no one uses radeon any longer and the hang stays undetected. -Daniel
Am 23.07.2014 10:07, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 9:58 AM, Christian König > <deathsimple@vodafone.de> wrote: >> Just imagine an application using prime is locking up Radeon and because of >> that gets killed by the user. Nothing else in the system would use the >> Radeon hardware any more and so radeon gets only called by another driver >> waiting patiently for radeon to finish rendering which never happens because >> the whole thing is locked up and we don't get a chance to recover. > But isn't that possible already without fences? X hangs radeon, user > crashes X for unrelated reasons before radeon will notice the hang. > Then no one uses radeon any longer and the hang stays undetected. Yeah, especially with multimedia applications. But I don't really care about this problem because the next time an application tries to use the block in question we actually do the reset and everything is fine. In your example we would do the reset when the next X server starts; before that point nobody would care because nobody uses the hardware. An additional problem here is that resets are something perfectly normal for radeon. For example UVD can "crash" when you feed it with invalid bitstream data (ok, actually it sends an interrupt and stops any processing so the driver can investigate). To continue processing you need to go through a rather complicated reset procedure. Christian. > -Daniel
op 23-07-14 10:20, Christian König schreef: > Am 23.07.2014 10:07, schrieb Daniel Vetter: >> On Wed, Jul 23, 2014 at 9:58 AM, Christian König >> <deathsimple@vodafone.de> wrote: >>> Just imagine an application using prime is locking up Radeon and because of >>> that gets killed by the user. Nothing else in the system would use the >>> Radeon hardware any more and so radeon gets only called by another driver >>> waiting patiently for radeon to finish rendering which never happens because >>> the whole thing is locked up and we don't get a chance to recover. >> But isn't that possible already without fences? X hangs radeon, user >> crashes X for unrelated reasons before radeon will notice the hang. >> Then no one uses radeon any longer and the hang stays undetected. > > Yeah, especially with multimedia application. But I don't really care about this problem because the next time an application tries to use the block in question we actually do the reset and everything is fine. > > In your example we would do the reset when the next X server starts, before that point nobody would care because nobody uses the hardware. > > An additional problem here is that resets are something perfect normal for radeon. For example UVD can "crash" when you feed it with invalid bitstream data, (ok actually it send an interrupt and stops any processing for the driver to investigate). To continue processing you need to go through a rather complicated reset procedure. In this case if the sync was to i915 the i915 lockup procedure would take care of itself. It wouldn't fix radeon, but it would at least unblock your intel card again. I haven't specifically added a special case to attempt to unblock external fences, but I've considered it. :-) ~Maarten
Am 23.07.2014 10:01, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 9:37 AM, Christian König > <christian.koenig@amd.com> wrote: >> Am 23.07.2014 09:31, schrieb Daniel Vetter: >>> On Wed, Jul 23, 2014 at 9:26 AM, Christian König >>> <deathsimple@vodafone.de> wrote: >>>> It's not a locking problem I'm talking about here. Radeons lockup >>>> handling >>>> kicks in when anything calls into the driver from the outside, if you >>>> have a >>>> fence wait function that's called from the outside but doesn't handle >>>> lockups you essentially rely on somebody else calling another radeon >>>> function for the lockup to be resolved. >>> So you don't have a timer in radeon that periodically checks whether >>> progress is still being made? That's the approach we're using in i915, >>> together with some tricks to kick any stuck waiters so that we can >>> reliably step in and grab locks for the reset. >> >> We tried this approach, but it didn't worked at all. >> >> I already considered trying it again because of the upcoming fence >> implementation, but reconsidering that when a driver is forced to change >> it's handling because of the fence implementation that's just another hint >> that there is something wrong here. > Out of curiosity: What's the blocker for using a timer/scheduled work > to reset radeon? Getting this right on i915 has been fairly tricky and > we now have an elaborate multi-stage state machine to get the driver > through a reset. So always interested in different solutions. IIRC we would have needed a quite advanced multi-stage state machine as well and that was just to much overhead at this point. One major problem was the power management in use back then, but that got replaced by DPM in the meantime. So it might be a good idea to try again. What we currently do is marking the driver as "needs reset" and returning -EAGAIN and then the next IOCTL starts the reset procedure before doing anything else. Christian. > -Daniel
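For readers who don't know radeon's current recovery flow, a minimal sketch of the "flag the lockup, fail with -EAGAIN, recover at the next entry point" pattern Christian describes could look like the code below. All names are hypothetical and deliberately simplified; this is not the actual radeon code.

#include <linux/rwsem.h>
#include <linux/errno.h>
#include <linux/types.h>

struct hypo_device {
        struct rw_semaphore exclusive_lock;     /* readers: ioctls, writer: reset */
        bool needs_reset;
};

/* hypothetical driver internals, declared only so the sketch is complete */
bool hypo_gpu_is_hung(struct hypo_device *dev);
void hypo_gpu_reset(struct hypo_device *dev);
long hypo_wait_seq(struct hypo_device *dev, u64 seq, long timeout);

/* A wait that notices the hang only flags it and bails out. */
static long hypo_fence_wait(struct hypo_device *dev, u64 seq, long timeout)
{
        long r = hypo_wait_seq(dev, seq, timeout);

        if (r == -ETIMEDOUT && hypo_gpu_is_hung(dev)) {
                dev->needs_reset = true;
                return -EAGAIN;         /* userspace restarts the ioctl */
        }
        return r;
}

/* Every hw-touching ioctl performs pending recovery before anything else. */
static int hypo_begin_ioctl(struct hypo_device *dev)
{
        down_read(&dev->exclusive_lock);
        while (dev->needs_reset) {
                up_read(&dev->exclusive_lock);
                down_write(&dev->exclusive_lock);
                if (dev->needs_reset) {
                        hypo_gpu_reset(dev);
                        dev->needs_reset = false;
                }
                downgrade_write(&dev->exclusive_lock);
        }
        return 0;       /* caller does its work, then calls up_read() */
}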
On Wed, Jul 23, 2014 at 10:25 AM, Maarten Lankhorst
<maarten.lankhorst@canonical.com> wrote:
> In this case if the sync was to i915 the i915 lockup procedure would take care of itself. It wouldn't fix radeon, but it would at least unblock your intel card again. I haven't specifically added a special case to attempt to unblock external fences, but I've considered it. :-)
Actually the i915 reset stuff relies crucially on being able to kick
all waiters holding driver locks. Since the current fence code only
exposes an opaque wait function without exposing the underlying wait
queue we won't be able to sleep on both the fence queue and the reset
queue. So this would pose a problem if we add fence_wait calls to our
driver.
-Daniel
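To make the constraint concrete: with only the opaque ->wait() a waiter cannot add its own wake-up conditions. A rough sketch of the alternative being discussed, where a driver sleeps on its own queue and lets the foreign fence wake it through a callback (the fence_* calls are from this patch series; reset_queue and reset_pending are made-up driver state):

#include <linux/fence.h>
#include <linux/wait.h>
#include <linux/atomic.h>

struct wake_cb {
        struct fence_cb base;
        wait_queue_head_t *wq;
};

static void wake_cb_func(struct fence *f, struct fence_cb *cb)
{
        struct wake_cb *wcb = container_of(cb, struct wake_cb, base);

        wake_up_all(wcb->wq);
}

/*
 * Sleep until the foreign fence signals OR the driver's own reset code
 * asks all waiters to back off.
 */
static long wait_fence_or_reset(struct fence *f,
                                wait_queue_head_t *reset_queue,
                                atomic_t *reset_pending, long timeout)
{
        struct wake_cb cb = { .wq = reset_queue };
        long r;

        if (fence_add_callback(f, &cb.base, wake_cb_func))
                return timeout;         /* already signaled */

        r = wait_event_interruptible_timeout(*reset_queue,
                        fence_is_signaled(f) || atomic_read(reset_pending),
                        timeout);

        fence_remove_callback(f, &cb.base);
        return r;
}

The point is only that the driver keeps control of the wait loop; whether i915 would actually want to do this is exactly what is being debated here.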
Am 23.07.2014 10:42, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 10:25 AM, Maarten Lankhorst > <maarten.lankhorst@canonical.com> wrote: >> In this case if the sync was to i915 the i915 lockup procedure would take care of itself. It wouldn't fix radeon, but it would at least unblock your intel card again. I haven't specifically added a special case to attempt to unblock external fences, but I've considered it. :-) > Actually the i915 reset stuff relies crucially on being able to kick > all waiters holding driver locks. Since the current fence code only > exposes an opaque wait function without exposing the underlying wait > queue we won't be able to sleep on both the fence queue and the reset > queue. So would pose a problem if we add fence_wait calls to our > driver. And apart from that I really think that I misunderstood Maarten. But his explanation sounds like i915 would do a reset because Radeon is locked up, right? Well if that's really the case then I would question the interface even more, cause that is really nonsense. Christian. > -Daniel
On Wed, Jul 23, 2014 at 10:46 AM, Christian König <deathsimple@vodafone.de> wrote: > Am 23.07.2014 10:42, schrieb Daniel Vetter: > >> On Wed, Jul 23, 2014 at 10:25 AM, Maarten Lankhorst >> <maarten.lankhorst@canonical.com> wrote: >>> >>> In this case if the sync was to i915 the i915 lockup procedure would take >>> care of itself. It wouldn't fix radeon, but it would at least unblock your >>> intel card again. I haven't specifically added a special case to attempt to >>> unblock external fences, but I've considered it. :-) >> >> Actually the i915 reset stuff relies crucially on being able to kick >> all waiters holding driver locks. Since the current fence code only >> exposes an opaque wait function without exposing the underlying wait >> queue we won't be able to sleep on both the fence queue and the reset >> queue. So would pose a problem if we add fence_wait calls to our >> driver. > > > And apart from that I really think that I misunderstood Maarten. But his > explanation sounds like i915 would do a reset because Radeon is locked up, > right? > > Well if that's really the case then I would question the interface even > more, cause that is really nonsense. I disagree - the entire point of fences is that we can do multi-gpu work asynchronously. So by the time we'll notice that radeon's dead we have accepted the batch from userspace already. The only way to get rid of it again is through our reset machinery, which also tells userspace that we couldn't execute the batch. Whether we actually need to do a hw reset depends upon whether we've committed the batch to the hw already. Atm that's always the case, but the scheduler will change that. So I have no issues with intel doing a reset when other drivers don't signal fences. Also this isn't a problem with the interface really, but with the current implementation for radeon. And getting cross-driver reset notifications right will require more work either way. -Daniel
Am 23.07.2014 10:54, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 10:46 AM, Christian König > <deathsimple@vodafone.de> wrote: >> Am 23.07.2014 10:42, schrieb Daniel Vetter: >> >>> On Wed, Jul 23, 2014 at 10:25 AM, Maarten Lankhorst >>> <maarten.lankhorst@canonical.com> wrote: >>>> In this case if the sync was to i915 the i915 lockup procedure would take >>>> care of itself. It wouldn't fix radeon, but it would at least unblock your >>>> intel card again. I haven't specifically added a special case to attempt to >>>> unblock external fences, but I've considered it. :-) >>> Actually the i915 reset stuff relies crucially on being able to kick >>> all waiters holding driver locks. Since the current fence code only >>> exposes an opaque wait function without exposing the underlying wait >>> queue we won't be able to sleep on both the fence queue and the reset >>> queue. So would pose a problem if we add fence_wait calls to our >>> driver. >> >> And apart from that I really think that I misunderstood Maarten. But his >> explanation sounds like i915 would do a reset because Radeon is locked up, >> right? >> >> Well if that's really the case then I would question the interface even >> more, cause that is really nonsense. > I disagree - the entire point of fences is that we can do multi-gpu > work asynchronously. So by the time we'll notice that radeon's dead we > have accepted the batch from userspace already. The only way to get > rid of it again is through our reset machinery, which also tells > userspace that we couldn't execute the batch. Whether we actually need > to do a hw reset depends upon whether we've committed the batch to the > hw already. Atm that's always the case, but the scheduler will change > that. So I have no issues with intel doing a reset when other drivers > don't signal fences. You submit a job to the hardware and then block the job to wait for radeon to be finished? Well than this would indeed require a hardware reset, but wouldn't that make the whole problem even worse? I mean currently we block one userspace process to wait for other hardware to be finished with a buffer, but what you are describing here blocks the whole hardware to wait for other hardware which in the end blocks all userspace process accessing the hardware. Talking about alternative approaches wouldn't it be simpler to just offload the waiting to a different kernel or userspace thread? Christian. > > Also this isn't a problem with the interface really, but with the > current implementation for radeon. And getting cross-driver reset > notifications right will require more work either way. > -Daniel
On Wed, Jul 23, 2014 at 11:27 AM, Christian König <christian.koenig@amd.com> wrote: > You submit a job to the hardware and then block the job to wait for radeon > to be finished? Well than this would indeed require a hardware reset, but > wouldn't that make the whole problem even worse? > > I mean currently we block one userspace process to wait for other hardware > to be finished with a buffer, but what you are describing here blocks the > whole hardware to wait for other hardware which in the end blocks all > userspace process accessing the hardware. There is nothing new here with prime - if one context hangs the gpu it blocks everyone else on i915. > Talking about alternative approaches wouldn't it be simpler to just offload > the waiting to a different kernel or userspace thread? Well this is exactly what we'll do once we have the scheduler. But this is an orthogonal issue imo. -Daniel
Am 23.07.2014 11:30, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 11:27 AM, Christian König > <christian.koenig@amd.com> wrote: >> You submit a job to the hardware and then block the job to wait for radeon >> to be finished? Well than this would indeed require a hardware reset, but >> wouldn't that make the whole problem even worse? >> >> I mean currently we block one userspace process to wait for other hardware >> to be finished with a buffer, but what you are describing here blocks the >> whole hardware to wait for other hardware which in the end blocks all >> userspace process accessing the hardware. > There is nothing new here with prime - if one context hangs the gpu it > blocks everyone else on i915. > >> Talking about alternative approaches wouldn't it be simpler to just offload >> the waiting to a different kernel or userspace thread? > Well this is exactly what we'll do once we have the scheduler. But > this is an orthogonal issue imo. Mhm, could have the scheduler first? Cause that sounds like reducing the necessary fence interface to just a fence->wait function. Christian. > -Daniel
op 23-07-14 11:36, Christian König schreef: > Am 23.07.2014 11:30, schrieb Daniel Vetter: >> On Wed, Jul 23, 2014 at 11:27 AM, Christian König >> <christian.koenig@amd.com> wrote: >>> You submit a job to the hardware and then block the job to wait for radeon >>> to be finished? Well than this would indeed require a hardware reset, but >>> wouldn't that make the whole problem even worse? >>> >>> I mean currently we block one userspace process to wait for other hardware >>> to be finished with a buffer, but what you are describing here blocks the >>> whole hardware to wait for other hardware which in the end blocks all >>> userspace process accessing the hardware. >> There is nothing new here with prime - if one context hangs the gpu it >> blocks everyone else on i915. >> >>> Talking about alternative approaches wouldn't it be simpler to just offload >>> the waiting to a different kernel or userspace thread? >> Well this is exactly what we'll do once we have the scheduler. But >> this is an orthogonal issue imo. > > Mhm, could have the scheduler first? > > Cause that sounds like reducing the necessary fence interface to just a fence->wait function. You would also lose benefits like having a 'perf timechart' for gpu's. ~Maarten
On Wed, Jul 23, 2014 at 11:36 AM, Christian König <christian.koenig@amd.com> wrote: > Am 23.07.2014 11:30, schrieb Daniel Vetter: > >> On Wed, Jul 23, 2014 at 11:27 AM, Christian König >> <christian.koenig@amd.com> wrote: >>> >>> You submit a job to the hardware and then block the job to wait for >>> radeon >>> to be finished? Well than this would indeed require a hardware reset, but >>> wouldn't that make the whole problem even worse? >>> >>> I mean currently we block one userspace process to wait for other >>> hardware >>> to be finished with a buffer, but what you are describing here blocks the >>> whole hardware to wait for other hardware which in the end blocks all >>> userspace process accessing the hardware. >> >> There is nothing new here with prime - if one context hangs the gpu it >> blocks everyone else on i915. >> >>> Talking about alternative approaches wouldn't it be simpler to just >>> offload >>> the waiting to a different kernel or userspace thread? >> >> Well this is exactly what we'll do once we have the scheduler. But >> this is an orthogonal issue imo. > > > Mhm, could have the scheduler first? > > Cause that sounds like reducing the necessary fence interface to just a > fence->wait function. The scheduler needs to keep track of a lot of fences, so I think we'll have to register callbacks, not a simple wait function. We must keep track of all the non-i915 fences for all oustanding batches. Also, the scheduler doesn't eliminate the hw queue, only keep it much slower so that we can sneak in higher priority things. Really, scheduler or not is orthogonal. -Daniel
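As an illustration of "register callbacks, not a simple wait function": a scheduler job can carry a count of unsignaled dependencies and only be pushed to the hardware once that count drops to zero. A minimal sketch, with the job structure and naming invented for the example (this is not the i915 scheduler):

#include <linux/fence.h>
#include <linux/atomic.h>
#include <linux/workqueue.h>

struct sched_job {
        atomic_t pending;               /* unsignaled deps + 1, set up by the caller */
        struct work_struct run_work;    /* pushes the job to the hw ring */
};

struct sched_dep {
        struct fence_cb cb;
        struct sched_job *job;
};

static void sched_job_maybe_run(struct sched_job *job)
{
        if (atomic_dec_and_test(&job->pending))
                schedule_work(&job->run_work);
}

static void sched_dep_signaled(struct fence *f, struct fence_cb *cb)
{
        struct sched_dep *dep = container_of(cb, struct sched_dep, cb);

        sched_job_maybe_run(dep->job);
}

/* Called once for every dependency fence collected for this job. */
static void sched_job_add_dep(struct sched_job *job, struct sched_dep *dep,
                              struct fence *f)
{
        dep->job = job;
        atomic_inc(&job->pending);
        if (fence_add_callback(f, &dep->cb, sched_dep_signaled))
                atomic_dec(&job->pending);      /* fence already signaled */
}

/* After all deps are added, drop the initial reference and maybe run. */
static void sched_job_arm(struct sched_job *job)
{
        sched_job_maybe_run(job);
}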
Am 23.07.2014 11:38, schrieb Maarten Lankhorst: > op 23-07-14 11:36, Christian König schreef: >> Am 23.07.2014 11:30, schrieb Daniel Vetter: >>> On Wed, Jul 23, 2014 at 11:27 AM, Christian König >>> <christian.koenig@amd.com> wrote: >>>> You submit a job to the hardware and then block the job to wait for radeon >>>> to be finished? Well than this would indeed require a hardware reset, but >>>> wouldn't that make the whole problem even worse? >>>> >>>> I mean currently we block one userspace process to wait for other hardware >>>> to be finished with a buffer, but what you are describing here blocks the >>>> whole hardware to wait for other hardware which in the end blocks all >>>> userspace process accessing the hardware. >>> There is nothing new here with prime - if one context hangs the gpu it >>> blocks everyone else on i915. >>> >>>> Talking about alternative approaches wouldn't it be simpler to just offload >>>> the waiting to a different kernel or userspace thread? >>> Well this is exactly what we'll do once we have the scheduler. But >>> this is an orthogonal issue imo. >> Mhm, could have the scheduler first? >> >> Cause that sounds like reducing the necessary fence interface to just a fence->wait function. > You would also lose benefits like having a 'perf timechart' for gpu's. I can live with that, when it reduces the complexity of the fence interface. Christian. > > ~Maarten >
On Wed, Jul 23, 2014 at 11:39 AM, Daniel Vetter <daniel.vetter@ffwll.ch> wrote: > The scheduler needs to keep track of a lot of fences, so I think we'll > have to register callbacks, not a simple wait function. We must keep > track of all the non-i915 fences for all oustanding batches. Also, the > scheduler doesn't eliminate the hw queue, only keep it much slower so > that we can sneak in higher priority things. > > Really, scheduler or not is orthogonal. Also see my other comment about interactions between wait_fence and the i915 reset logic. We can't actually use it from within the scheduler code since that would deadlock. -Daniel
Am 23.07.2014 11:44, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 11:39 AM, Daniel Vetter <daniel.vetter@ffwll.ch> wrote: >> The scheduler needs to keep track of a lot of fences, so I think we'll >> have to register callbacks, not a simple wait function. We must keep >> track of all the non-i915 fences for all oustanding batches. Also, the >> scheduler doesn't eliminate the hw queue, only keep it much slower so >> that we can sneak in higher priority things. >> >> Really, scheduler or not is orthogonal. > Also see my other comment about interactions between wait_fence and > the i915 reset logic. We can't actually use it from within the > scheduler code since that would deadlock. Yeah, I see. You would need some way to abort the waiting on other devices fences in case of a lockup. What about an userspace thread to offload waiting and command submission to? Just playing with ideas right now, Christian. > -Daniel
On Wed, Jul 23, 2014 at 11:47 AM, Christian König <deathsimple@vodafone.de> wrote: > Am 23.07.2014 11:44, schrieb Daniel Vetter: >> On Wed, Jul 23, 2014 at 11:39 AM, Daniel Vetter <daniel.vetter@ffwll.ch> >> wrote: >>> >>> The scheduler needs to keep track of a lot of fences, so I think we'll >>> have to register callbacks, not a simple wait function. We must keep >>> track of all the non-i915 fences for all oustanding batches. Also, the >>> scheduler doesn't eliminate the hw queue, only keep it much slower so >>> that we can sneak in higher priority things. >>> >>> Really, scheduler or not is orthogonal. >> >> Also see my other comment about interactions between wait_fence and >> the i915 reset logic. We can't actually use it from within the >> scheduler code since that would deadlock. > > > Yeah, I see. You would need some way to abort the waiting on other devices > fences in case of a lockup. > > What about an userspace thread to offload waiting and command submission to? That's what your android guys currently do. They hate it. And google explicitly created their syncpts stuff to move all that into the kernel. That one does explicit fencing, but the end result is still that you have fences as deps between different drivers. The other problem is that dri/prime is running under an implicitly sync'ed model, so there's no clear point/responsibility for who would actually do the waiting. You'll end up with synchronous behaviour since the render sooner or later needs to perfectly align with client/compositor ipc. -Daniel
op 23-07-14 11:47, Christian König schreef: > Am 23.07.2014 11:44, schrieb Daniel Vetter: >> On Wed, Jul 23, 2014 at 11:39 AM, Daniel Vetter <daniel.vetter@ffwll.ch> wrote: >>> The scheduler needs to keep track of a lot of fences, so I think we'll >>> have to register callbacks, not a simple wait function. We must keep >>> track of all the non-i915 fences for all oustanding batches. Also, the >>> scheduler doesn't eliminate the hw queue, only keep it much slower so >>> that we can sneak in higher priority things. >>> >>> Really, scheduler or not is orthogonal. >> Also see my other comment about interactions between wait_fence and >> the i915 reset logic. We can't actually use it from within the >> scheduler code since that would deadlock. > > Yeah, I see. You would need some way to abort the waiting on other devices fences in case of a lockup. > > What about an userspace thread to offload waiting and command submission to? You would still need enable_signaling, else polling on the dma-buf wouldn't work. ;-) Can't wait synchronously with multiple shared fences, need to poll for that. And the dma-buf would still have fences belonging to both drivers, and it would still call from outside the driver. ~Maarten
Am 23.07.2014 11:55, schrieb Maarten Lankhorst: > op 23-07-14 11:47, Christian König schreef: >> Am 23.07.2014 11:44, schrieb Daniel Vetter: >>> On Wed, Jul 23, 2014 at 11:39 AM, Daniel Vetter <daniel.vetter@ffwll.ch> wrote: >>>> The scheduler needs to keep track of a lot of fences, so I think we'll >>>> have to register callbacks, not a simple wait function. We must keep >>>> track of all the non-i915 fences for all oustanding batches. Also, the >>>> scheduler doesn't eliminate the hw queue, only keep it much slower so >>>> that we can sneak in higher priority things. >>>> >>>> Really, scheduler or not is orthogonal. >>> Also see my other comment about interactions between wait_fence and >>> the i915 reset logic. We can't actually use it from within the >>> scheduler code since that would deadlock. >> Yeah, I see. You would need some way to abort the waiting on other devices fences in case of a lockup. >> >> What about an userspace thread to offload waiting and command submission to? > You would still need enable_signaling, else polling on the dma-buf wouldn't work. ;-) > Can't wait synchronously with multiple shared fences, need to poll for that. No you don't. Just make a list of fences you need to wait for and wait for each one after the other. But having a thread for each command submission context doesn't sound like the best solution anyway. > And the dma-buf would still have fences belonging to both drivers, and it would still call from outside the driver. Calling from outside the driver is fine as long as the driver can do everything necessary to complete its work and isn't forced into any ugly hacks and things that are not 100% reliable. So I don't see much of an alternative to integrating recovery code for non-firing interrupts and some kind of lockup handling into the fence code as well. Christian. > > ~Maarten >
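Waiting on a list of shared fences one after the other, as Christian suggests, is indeed simple with the proposed API; a sketch (the array and count come from wherever the caller collected the fences, e.g. a reservation object):

#include <linux/fence.h>

/* Wait for every fence in the array; returns <= 0 on error/timeout. */
static long wait_all_fences(struct fence **fences, unsigned count,
                            bool intr, long timeout)
{
        unsigned i;
        long r;

        for (i = 0; i < count; ++i) {
                r = fence_wait_timeout(fences[i], intr, timeout);
                if (r <= 0)
                        return r;       /* error, or 0 == timed out */
                timeout = r;            /* remaining jiffies for the next fence */
        }
        return timeout;
}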
On Wed, Jul 23, 2014 at 12:13 PM, Christian König <christian.koenig@amd.com> wrote: > >> And the dma-buf would still have fences belonging to both drivers, and it >> would still call from outside the driver. > > > Calling from outside the driver is fine as long as the driver can do > everything necessary to complete it's work and isn't forced into any ugly > hacks and things that are not 100% reliable. > > So I don't see much other approach as integrating recovery code for not > firing interrupts and some kind of lockup handling into the fence code as > well. That approach doesn't really work at that well since every driver has it's own reset semantics. And we're trying to move away from global reset to fine-grained reset. So stop-the-world reset is out of fashion, at least for i915. As you said, reset is normal in gpus and we're trying to make reset less invasive. I really don't see a point in imposing a reset scheme upon all drivers and I think you have about as much motivation to convert radeon to the scheme used by i915 as I'll have for converting to the one used by radeon. If it would fit at all. I guess for radeon we just have to add tons of insulation by punting all callbacks to work items so that radeon can do whatever it wants. Plus start a delayed_work based fallback when ->enable_signalling is called to make sure we work on platforms that lack interrupts. -Daniel
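The insulation Daniel mentions falls out of the callback API fairly naturally: the cross-driver callback does nothing but queue a work item, so whatever radeon wants to do happens later in process context where it may take its own locks. A rough sketch (struct and function names invented):

#include <linux/fence.h>
#include <linux/workqueue.h>

struct punted_cb {
        struct fence_cb cb;
        struct work_struct work;        /* does the real processing */
};

/* Runs in process context: free to take driver locks, sleep, reset, ... */
static void punted_cb_work(struct work_struct *work)
{
        struct punted_cb *p = container_of(work, struct punted_cb, work);

        /* driver-specific reaction to the foreign fence signaling */
        (void)p;
}

/* May be called from the exporting driver's irq handler. */
static void punted_cb_func(struct fence *f, struct fence_cb *cb)
{
        struct punted_cb *p = container_of(cb, struct punted_cb, cb);

        schedule_work(&p->work);
}

static int watch_foreign_fence(struct fence *f, struct punted_cb *p)
{
        INIT_WORK(&p->work, punted_cb_work);
        return fence_add_callback(f, &p->cb, punted_cb_func);
}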
On Wed, Jul 23, 2014 at 2:52 AM, Christian König <christian.koenig@amd.com> wrote: > Am 23.07.2014 08:40, schrieb Maarten Lankhorst: > >> op 22-07-14 17:59, Christian König schreef: >>> >>> Am 22.07.2014 17:42, schrieb Daniel Vetter: >>>> >>>> On Tue, Jul 22, 2014 at 5:35 PM, Christian König >>>> <christian.koenig@amd.com> wrote: >>>>> >>>>> Drivers exporting fences need to provide a fence->signaled and a >>>>> fence->wait >>>>> function, everything else like fence->enable_signaling or calling >>>>> fence_signaled() from the driver is optional. >>>>> >>>>> Drivers wanting to use exported fences don't call fence->signaled or >>>>> fence->wait in atomic or interrupt context, and not with holding any >>>>> global >>>>> locking primitives (like mmap_sem etc...). Holding locking primitives >>>>> local >>>>> to the driver is ok, as long as they don't conflict with anything >>>>> possible >>>>> used by their own fence implementation. >>>> >>>> Well that's almost what we have right now with the exception that >>>> drivers are allowed (actually must for correctness when updating >>>> fences) the ww_mutexes for dma-bufs (or other buffer objects). >>> >>> In this case sorry for so much noise. I really haven't looked in so much >>> detail into anything but Maarten's Radeon patches. >>> >>> But how does that then work right now? My impression was that it's >>> mandatory for drivers to call fence_signaled()? >> >> It's only mandatory to call fence_signal() if the .enable_signaling >> callback has been called, else you can get away with never calling signaling >> a fence at all before dropping the last refcount to it. >> This allows you to keep interrupts disabled when you don't need them. > > > Can we somehow avoid the need to call fence_signal() at all? The interrupts > at least on radeon are way to unreliable for such a thing. Can > enable_signalling fail? What's the reason for fence_signaled() in the first > place? > the device you are sharing with may not be able to do hw<->hw signalling.. think about buffer sharing w/ camera, for example. You probably want your ->enable_signalling() to enable whatever workaround periodic-polling you need to do to catch missed irq's (and then call fence->signal() once you detect the fence has passed. fwiw, I haven't had a chance to read this whole thread yet, but I expect that a lot of the SoC devices, especially ones with separate kms-only display and gpu drivers, will want callback from gpu's irq to bang a few display controller registers. I agree in general callbacks from atomic ctx is probably something you want to avoid, but in this particular case I think it is worth the extra complexity. Nothing is stopping a driver from using a callback that just chucks something on a workqueue, whereas the inverse is not possible. BR, -R > >>>> Agreed that any shared locks are out of the way (especially stuff like >>>> dev->struct_mutex or other non-strictly driver-private stuff, i915 is >>>> really bad here still). >>> >>> Yeah that's also an point I've wanted to note on Maartens patch. Radeon >>> grabs the read side of it's exclusive semaphore while waiting for fences >>> (because it assumes that the fence it waits for is a Radeon fence). >>> >>> Assuming that we need to wait in both directions with Prime (e.g. Intel >>> driver needs to wait for Radeon to finish rendering and Radeon needs to wait >>> for Intel to finish displaying), this might become a perfect example of >>> locking inversion. 
>> >> In the preliminary patches where I can sync radeon with other GPU's I've >> been very careful in all the places that call into fences, to make sure that >> radeon wouldn't try to handle lockups for a different (possibly also radeon) >> card. > > > That's actually not such a good idea. > > In case of a lockup we need to handle the lockup cause otherwise it could > happen that radeon waits for the lockup to be resolved and the lockup > handling needs to wait for a fence that's never signaled because of the > lockup. > > Christian. > > >> >> This is also why fence_is_signaled should never block, and why it trylocks >> the exclusive_lock. :-) I think lockdep would complain if I grabbed >> exclusive_lock while blocking in is_signaled. >> >>>> So from the core fence framework I think we already have exactly this, >>>> and we only need to adjust the radeon implementation a bit to make it >>>> less risky and invasive to the radeon driver logic. >>> >>> Agree. Well the biggest problem I see is that exclusive semaphore I need >>> to take when anything calls into the driver. For the fence code I need to >>> move that down into the fence->signaled handler, cause that now can be >>> called from outside the driver. >>> >>> Maarten solved this by telling the driver in the lockup handler (where we >>> grab the write side of the exclusive lock) that all interrupts are already >>> enabled, so that fence->signaled hopefully wouldn't mess with the hardware >>> at all. While this probably works, it just leaves me with a feeling that we >>> are doing something wrong here. >> >> There is unfortunately no global mechanism to say 'this card is locked up, >> please don't call into any of my fences', and I don't associate fences with >> devices, and radeon doesn't keep a global list of fences. >> If all of that existed, it would complicate the interface and its callers >> a lot, while I like to keep things simple. >> So I did the best I could, and simply prevented the fence calls from >> fiddling with the hardware. Fortunately gpu lockup is not a common >> operation. :-) >> >> ~Maarten >> >> > > _______________________________________________ > dri-devel mailing list > dri-devel@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/dri-devel
Am 23.07.2014 12:52, schrieb Daniel Vetter: > On Wed, Jul 23, 2014 at 12:13 PM, Christian König > <christian.koenig@amd.com> wrote: >>> And the dma-buf would still have fences belonging to both drivers, and it >>> would still call from outside the driver. >> >> Calling from outside the driver is fine as long as the driver can do >> everything necessary to complete it's work and isn't forced into any ugly >> hacks and things that are not 100% reliable. >> >> So I don't see much other approach as integrating recovery code for not >> firing interrupts and some kind of lockup handling into the fence code as >> well. > That approach doesn't really work at that well since every driver has > it's own reset semantics. And we're trying to move away from global > reset to fine-grained reset. So stop-the-world reset is out of > fashion, at least for i915. As you said, reset is normal in gpus and > we're trying to make reset less invasive. I really don't see a point > in imposing a reset scheme upon all drivers and I think you have about > as much motivation to convert radeon to the scheme used by i915 as > I'll have for converting to the one used by radeon. If it would fit at > all. Oh my! No, I didn't want to suggest any global reset infrastructure. My idea was more that the fence framework provides a fence->process_signaling callback that is periodically called after enable_signaling is called to trigger manual signal processing in the driver. This would be suitable both as a fallback in case interrupts don't work and as a chance for any driver to do necessary lockup handling. Christian. > I guess for radeon we just have to add tons of insulation by punting > all callbacks to work items so that radeon can do whatever it wants. > Plus start a delayed_work based fallback when ->enable_signalling is > called to make sure we work on platforms that lack interrupts. > -Daniel
On Wed, Jul 23, 2014 at 2:36 PM, Christian König <christian.koenig@amd.com> wrote: > My idea was more that the fence framework provides a > fence->process_signaling callback that is periodically called after > enable_signaling is called to trigger manual signal processing in the > driver. > > This would both be suitable as a fallback in case of not working interrupts > as well as a chance for any driver to do necessary lockup handling. Imo that should be an implementation detail of the fence provider. So in ->enable_signaling radeon needs to arm a delayed work to regularly check fence and signal them if the irq failed. If it's a common need we might provide some shared code for this (e.g. a struct unreliable_fence or so). But this shouldn't be mandatory since not all gpus are broken like that. And if we force other drivers to care for this special case that imo leaks the abstraction out of radeon (or any other driver with unreliable interrupts). -Daniel
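The delayed_work fallback Daniel sketches, armed from ->enable_signaling so that fences still get signaled when the irq never fires, could look roughly like this. Everything below is a hypothetical exporter (the "unreliable_fence" idea), not actual radeon code; a real version would also have to hold a reference on the fence while the work is pending and cancel the work once nothing is left to poll.

#include <linux/fence.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

struct unreliable_fence {
        struct fence base;
        struct delayed_work poll_work;  /* fallback when the irq is lost */
};

/* hypothetical hooks into the exporting driver */
bool hw_seq_has_passed(struct fence *f);
void enable_fence_irq(struct fence *f);

static void unreliable_fence_poll(struct work_struct *work)
{
        struct unreliable_fence *uf =
                container_of(work, struct unreliable_fence, poll_work.work);

        if (hw_seq_has_passed(&uf->base))
                fence_signal(&uf->base);
        else
                schedule_delayed_work(&uf->poll_work, msecs_to_jiffies(10));
}

static bool unreliable_fence_enable_signaling(struct fence *f)
{
        struct unreliable_fence *uf =
                container_of(f, struct unreliable_fence, base);

        if (hw_seq_has_passed(f))
                return false;           /* already done, no callback needed */

        enable_fence_irq(f);            /* the normal, fast path */

        /* belt and braces: poll in case that irq never arrives */
        INIT_DELAYED_WORK(&uf->poll_work, unreliable_fence_poll);
        schedule_delayed_work(&uf->poll_work, msecs_to_jiffies(10));
        return true;
}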
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 29d9cc04c04e..03a5567f2c2f 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -64,6 +64,7 @@ #include <linux/wait.h> #include <linux/list.h> #include <linux/kref.h> +#include <linux/fence.h> #include <ttm/ttm_bo_api.h> #include <ttm/ttm_bo_driver.h> @@ -116,9 +117,6 @@ extern int radeon_deep_color; #define RADEONFB_CONN_LIMIT 4 #define RADEON_BIOS_NUM_SCRATCH 8 -/* fence seq are set to this number when signaled */ -#define RADEON_FENCE_SIGNALED_SEQ 0LL - /* internal ring indices */ /* r1xx+ has gfx CP ring */ #define RADEON_RING_TYPE_GFX_INDEX 0 @@ -350,12 +348,15 @@ struct radeon_fence_driver { }; struct radeon_fence { + struct fence base; + struct radeon_device *rdev; - struct kref kref; /* protected by radeon_fence.lock */ uint64_t seq; /* RB, DMA, etc. */ unsigned ring; + + wait_queue_t fence_wake; }; int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring); @@ -2268,6 +2269,7 @@ struct radeon_device { struct radeon_mman mman; struct radeon_fence_driver fence_drv[RADEON_NUM_RINGS]; wait_queue_head_t fence_queue; + unsigned fence_context; struct mutex ring_lock; struct radeon_ring ring[RADEON_NUM_RINGS]; bool ib_pool_ready; @@ -2358,11 +2360,6 @@ u32 cik_mm_rdoorbell(struct radeon_device *rdev, u32 index); void cik_mm_wdoorbell(struct radeon_device *rdev, u32 index, u32 v); /* - * Cast helper - */ -#define to_radeon_fence(p) ((struct radeon_fence *)(p)) - -/* * Registers read & write functions. */ #define RREG8(reg) readb((rdev->rmmio) + (reg)) diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 03686fab842d..86699df7c8f3 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1213,6 +1213,7 @@ int radeon_device_init(struct radeon_device *rdev, for (i = 0; i < RADEON_NUM_RINGS; i++) { rdev->ring[i].idx = i; } + rdev->fence_context = fence_context_alloc(RADEON_NUM_RINGS); DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X).\n", radeon_family_name[rdev->family], pdev->vendor, pdev->device, @@ -1607,6 +1608,54 @@ int radeon_resume_kms(struct drm_device *dev, bool resume, bool fbcon) return 0; } +static uint32_t radeon_gpu_mask_sw_irq(struct radeon_device *rdev) +{ + uint32_t mask = 0; + int i; + + if (!rdev->ddev->irq_enabled) + return mask; + + /* + * increase refcount on sw interrupts for all rings to stop + * enabling interrupts in radeon_fence_enable_signaling during + * gpu reset. + */ + + for (i = 0; i < RADEON_NUM_RINGS; ++i) { + if (!rdev->ring[i].ready) + continue; + + atomic_inc(&rdev->irq.ring_int[i]); + mask |= 1 << i; + } + return mask; +} + +static void radeon_gpu_unmask_sw_irq(struct radeon_device *rdev, uint32_t mask) +{ + unsigned long irqflags; + int i; + + if (!mask) + return; + + /* + * undo refcount increase, and reset irqs to correct value. 
+ */ + + for (i = 0; i < RADEON_NUM_RINGS; ++i) { + if (!(mask & (1 << i))) + continue; + + atomic_dec(&rdev->irq.ring_int[i]); + } + + spin_lock_irqsave(&rdev->irq.lock, irqflags); + radeon_irq_set(rdev); + spin_unlock_irqrestore(&rdev->irq.lock, irqflags); +} + /** * radeon_gpu_reset - reset the asic * @@ -1624,6 +1673,7 @@ int radeon_gpu_reset(struct radeon_device *rdev) int i, r; int resched; + uint32_t sw_mask; down_write(&rdev->exclusive_lock); @@ -1637,6 +1687,7 @@ int radeon_gpu_reset(struct radeon_device *rdev) radeon_save_bios_scratch_regs(rdev); /* block TTM */ resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev); + sw_mask = radeon_gpu_mask_sw_irq(rdev); radeon_pm_suspend(rdev); radeon_suspend(rdev); @@ -1686,13 +1737,20 @@ retry: radeon_pm_resume(rdev); drm_helper_resume_force_mode(rdev->ddev); + radeon_gpu_unmask_sw_irq(rdev, sw_mask); ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched); if (r) { /* bad news, how to tell it to userspace ? */ dev_info(rdev->dev, "GPU reset failed\n"); } - up_write(&rdev->exclusive_lock); + /* + * force all waiters to recheck, some may have been + * added while the exclusive_lock was unavailable + */ + downgrade_write(&rdev->exclusive_lock); + wake_up_all(&rdev->fence_queue); + up_read(&rdev->exclusive_lock); return r; } diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 6435719fd45b..81c98f6ff0ca 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -39,6 +39,15 @@ #include "radeon.h" #include "radeon_trace.h" +static const struct fence_ops radeon_fence_ops; + +#define to_radeon_fence(p) \ + ({ \ + struct radeon_fence *__f; \ + __f = container_of((p), struct radeon_fence, base); \ + __f->base.ops == &radeon_fence_ops ? __f : NULL; \ + }) + /* * Fences * Fences mark an event in the GPUs pipeline and are used @@ -111,30 +120,55 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring) { + u64 seq = ++rdev->fence_drv[ring].sync_seq[ring]; + /* we are protected by the ring emission mutex */ *fence = kmalloc(sizeof(struct radeon_fence), GFP_KERNEL); if ((*fence) == NULL) { return -ENOMEM; } - kref_init(&((*fence)->kref)); - (*fence)->rdev = rdev; - (*fence)->seq = ++rdev->fence_drv[ring].sync_seq[ring]; (*fence)->ring = ring; + fence_init(&(*fence)->base, &radeon_fence_ops, + &rdev->fence_queue.lock, rdev->fence_context + ring, seq); + (*fence)->rdev = rdev; + (*fence)->seq = seq; radeon_fence_ring_emit(rdev, ring, *fence); trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq); return 0; } /** - * radeon_fence_process - process a fence + * radeon_fence_check_signaled - callback from fence_queue * - * @rdev: radeon_device pointer - * @ring: ring index the fence is associated with - * - * Checks the current fence value and wakes the fence queue - * if the sequence number has increased (all asics). + * this function is called with fence_queue lock held, which is also used + * for the fence locking itself, so unlocked variants are used for + * fence_signal, and remove_wait_queue. 
*/ -void radeon_fence_process(struct radeon_device *rdev, int ring) +static int radeon_fence_check_signaled(wait_queue_t *wait, unsigned mode, int flags, void *key) +{ + struct radeon_fence *fence; + u64 seq; + + fence = container_of(wait, struct radeon_fence, fence_wake); + + seq = atomic64_read(&fence->rdev->fence_drv[fence->ring].last_seq); + if (seq >= fence->seq) { + int ret = fence_signal_locked(&fence->base); + + if (!ret) + FENCE_TRACE(&fence->base, "signaled from irq context\n"); + else + FENCE_TRACE(&fence->base, "was already signaled\n"); + + radeon_irq_kms_sw_irq_put(fence->rdev, fence->ring); + __remove_wait_queue(&fence->rdev->fence_queue, &fence->fence_wake); + fence_put(&fence->base); + } else + FENCE_TRACE(&fence->base, "pending\n"); + return 0; +} + +static bool __radeon_fence_process(struct radeon_device *rdev, int ring) { uint64_t seq, last_seq, last_emitted; unsigned count_loop = 0; @@ -190,23 +224,22 @@ void radeon_fence_process(struct radeon_device *rdev, int ring) } } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); - if (wake) - wake_up_all(&rdev->fence_queue); + return wake; } /** - * radeon_fence_destroy - destroy a fence + * radeon_fence_process - process a fence * - * @kref: fence kref + * @rdev: radeon_device pointer + * @ring: ring index the fence is associated with * - * Frees the fence object (all asics). + * Checks the current fence value and wakes the fence queue + * if the sequence number has increased (all asics). */ -static void radeon_fence_destroy(struct kref *kref) +void radeon_fence_process(struct radeon_device *rdev, int ring) { - struct radeon_fence *fence; - - fence = container_of(kref, struct radeon_fence, kref); - kfree(fence); + if (__radeon_fence_process(rdev, ring)) + wake_up_all(&rdev->fence_queue); } /** @@ -237,6 +270,69 @@ static bool radeon_fence_seq_signaled(struct radeon_device *rdev, return false; } +static bool __radeon_fence_signaled(struct fence *f) +{ + struct radeon_fence *fence = to_radeon_fence(f); + struct radeon_device *rdev = fence->rdev; + unsigned ring = fence->ring; + u64 seq = fence->seq; + + if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq) { + return true; + } + + if (down_read_trylock(&rdev->exclusive_lock)) { + radeon_fence_process(rdev, ring); + up_read(&rdev->exclusive_lock); + + if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq) { + return true; + } + } + return false; +} + +/** + * radeon_fence_enable_signaling - enable signalling on fence + * @fence: fence + * + * This function is called with fence_queue lock held, and adds a callback + * to fence_queue that checks if this fence is signaled, and if so it + * signals the fence and removes itself. + */ +static bool radeon_fence_enable_signaling(struct fence *f) +{ + struct radeon_fence *fence = to_radeon_fence(f); + struct radeon_device *rdev = fence->rdev; + + if (atomic64_read(&rdev->fence_drv[fence->ring].last_seq) >= fence->seq || + !rdev->ddev->irq_enabled) + return false; + + radeon_irq_kms_sw_irq_get(rdev, fence->ring); + + if (down_read_trylock(&rdev->exclusive_lock)) { + if (__radeon_fence_process(rdev, fence->ring)) + wake_up_all_locked(&rdev->fence_queue); + + up_read(&rdev->exclusive_lock); + } + + /* did fence get signaled after we enabled the sw irq? 
*/ + if (atomic64_read(&rdev->fence_drv[fence->ring].last_seq) >= fence->seq) { + radeon_irq_kms_sw_irq_put(rdev, fence->ring); + return false; + } + + fence->fence_wake.flags = 0; + fence->fence_wake.private = NULL; + fence->fence_wake.func = radeon_fence_check_signaled; + __add_wait_queue(&rdev->fence_queue, &fence->fence_wake); + fence_get(f); + + return true; +} + /** * radeon_fence_signaled - check if a fence has signaled * @@ -250,11 +346,13 @@ bool radeon_fence_signaled(struct radeon_fence *fence) if (!fence) { return true; } - if (fence->seq == RADEON_FENCE_SIGNALED_SEQ) { - return true; - } + if (radeon_fence_seq_signaled(fence->rdev, fence->seq, fence->ring)) { - fence->seq = RADEON_FENCE_SIGNALED_SEQ; + int ret; + + ret = fence_signal(&fence->base); + if (!ret) + FENCE_TRACE(&fence->base, "signaled from radeon_fence_signaled\n"); return true; } return false; @@ -413,21 +511,18 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr) uint64_t seq[RADEON_NUM_RINGS] = {}; long r; - if (fence == NULL) { - WARN(1, "Querying an invalid fence : %p !\n", fence); - return -EINVAL; - } - - seq[fence->ring] = fence->seq; - if (seq[fence->ring] == RADEON_FENCE_SIGNALED_SEQ) + if (test_bit(FENCE_FLAG_SIGNALED_BIT, &fence->base.flags)) return 0; + seq[fence->ring] = fence->seq; r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT); if (r < 0) { return r; } - fence->seq = RADEON_FENCE_SIGNALED_SEQ; + r = fence_signal(&fence->base); + if (!r) + FENCE_TRACE(&fence->base, "signaled from fence_wait\n"); return 0; } @@ -459,12 +554,13 @@ int radeon_fence_wait_any(struct radeon_device *rdev, continue; } + if (test_bit(FENCE_FLAG_SIGNALED_BIT, &fences[i]->base.flags)) { + /* already signaled */ + return 0; + } + seq[i] = fences[i]->seq; ++num_rings; - - /* test if something was allready signaled */ - if (seq[i] == RADEON_FENCE_SIGNALED_SEQ) - return 0; } /* nothing to wait for ? 
*/ @@ -545,7 +641,7 @@ int radeon_fence_wait_empty(struct radeon_device *rdev, int ring) */ struct radeon_fence *radeon_fence_ref(struct radeon_fence *fence) { - kref_get(&fence->kref); + fence_get(&fence->base); return fence; } @@ -561,9 +657,8 @@ void radeon_fence_unref(struct radeon_fence **fence) struct radeon_fence *tmp = *fence; *fence = NULL; - if (tmp) { - kref_put(&tmp->kref, radeon_fence_destroy); - } + if (tmp) + fence_put(&tmp->base); } /** @@ -872,3 +967,51 @@ int radeon_debugfs_fence_init(struct radeon_device *rdev) return 0; #endif } + +static long __radeon_fence_wait(struct fence *f, bool intr, long timeout) +{ + struct radeon_fence *fence = to_radeon_fence(f); + u64 target_seq[RADEON_NUM_RINGS] = {}; + struct radeon_device *rdev = fence->rdev; + long r; + + target_seq[fence->ring] = fence->seq; + + down_read(&rdev->exclusive_lock); + r = radeon_fence_wait_seq_timeout(fence->rdev, target_seq, intr, timeout); + + if (r > 0 && !fence_signal(&fence->base)) + FENCE_TRACE(&fence->base, "signaled from __radeon_fence_wait\n"); + + up_read(&rdev->exclusive_lock); + return r; + +} + +static const char *radeon_fence_get_driver_name(struct fence *fence) +{ + return "radeon"; +} + +static const char *radeon_fence_get_timeline_name(struct fence *f) +{ + struct radeon_fence *fence = to_radeon_fence(f); + switch (fence->ring) { + case RADEON_RING_TYPE_GFX_INDEX: return "radeon.gfx"; + case CAYMAN_RING_TYPE_CP1_INDEX: return "radeon.cp1"; + case CAYMAN_RING_TYPE_CP2_INDEX: return "radeon.cp2"; + case R600_RING_TYPE_DMA_INDEX: return "radeon.dma"; + case CAYMAN_RING_TYPE_DMA1_INDEX: return "radeon.dma1"; + case R600_RING_TYPE_UVD_INDEX: return "radeon.uvd"; + default: WARN_ON_ONCE(1); return "radeon.unk"; + } +} + +static const struct fence_ops radeon_fence_ops = { + .get_driver_name = radeon_fence_get_driver_name, + .get_timeline_name = radeon_fence_get_timeline_name, + .enable_signaling = radeon_fence_enable_signaling, + .signaled = __radeon_fence_signaled, + .wait = __radeon_fence_wait, + .release = NULL, +};
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com> --- drivers/gpu/drm/radeon/radeon.h | 15 +- drivers/gpu/drm/radeon/radeon_device.c | 60 ++++++++- drivers/gpu/drm/radeon/radeon_fence.c | 223 ++++++++++++++++++++++++++------ 3 files changed, 248 insertions(+), 50 deletions(-)
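One detail of the patch worth calling out: the unconditional to_radeon_fence() cast macro in radeon.h is replaced by a checked version that only trusts the container_of when f->base.ops matches radeon_fence_ops. That lets radeon code that is handed an arbitrary struct fence (for example out of a shared reservation object) distinguish its own fences from foreign ones, along the lines of this sketch (radeon_wait_or_sync is a made-up caller, not part of the patch):

static int radeon_wait_or_sync(struct fence *f)
{
        struct radeon_fence *rfence = to_radeon_fence(f);
        long r;

        if (rfence) {
                /* our own fence: the driver may use its internal knowledge */
                return radeon_fence_wait(rfence, false);
        }

        /* foreign fence: only the generic cross-driver API is allowed */
        r = fence_wait(f, false);
        return r < 0 ? r : 0;
}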