
drm/i915/userptr: Wrap mmu_notifier inside its own rw_semaphore

Message ID 20180326145955.4725-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson March 26, 2018, 2:59 p.m. UTC
We've always been blatantly ignoring mmu_notifier.h:

 * Invalidation of multiple concurrent ranges may be
 * optionally permitted by the driver. Either way the
 * establishment of sptes is forbidden in the range passed to
 * invalidate_range_begin/end for the whole duration of the
 * invalidate_range_begin/end critical section.

by not preventing concurrent calls to gup while an invalidate_range is
being processed. Wrap gup and invalidate_range in a paired rw_semaphore
to allow concurrent lookups that are then interrupted and disabled
across the invalidate_range. Further refinement can be applied by
tracking the invalidate_range versus individual gup, which should be
done using a common set of helpers, since all mmu_notifier subsystems
share the same need. I hear HMM is one such toolbox...

For the time being, assume concurrent invalidate and lookup are rare,
but not rare enough to completely ignore.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_userptr.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

Comments

Tvrtko Ursulin March 26, 2018, 3:59 p.m. UTC | #1
On 26/03/2018 15:59, Chris Wilson wrote:
> We've always been blatantly ignoring mmu_notifier.h:
> 
>   * Invalidation of multiple concurrent ranges may be
>   * optionally permitted by the driver. Either way the
>   * establishment of sptes is forbidden in the range passed to
>   * invalidate_range_begin/end for the whole duration of the
>   * invalidate_range_begin/end critical section.
> 
> by not preventing concurrent calls to gup while an invalidate_range is
> being processed. Wrap gup and invalidate_range in a paired rw_semaphore
> to allow concurrent lookups that are then interrupted and disabled
> across the invalidate_range. Further refinement can be applied by
> tracking the invalidate_range versus individual gup, which should be
> done using a common set of helpers, since all mmu_notifier subsystems
> share the same need. I hear HMM is one such toolbox...
> 
> For the time being, assume concurrent invalidate and lookup are rare,
> but not rare enough to completely ignore.

I think I suggested a few times that we should just "ban" the object on 
its first invalidate and never again, for its lifetime, allow it to 
obtain backing store. I just don't remember why we decided not to go 
with that approach. :( Thinking about it now I still don't see how this 
restriction would be a problem, and it would simplify things.
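
Something like this, as a purely illustrative sketch (the "banned" flag
is made up, not anything in the tree):

	static void
	i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, ...)
	{
		/* ... cancel and flush the workers as today ... */
		obj->userptr.banned = true; /* first invalidate poisons it */
	}

	/* and get_pages() would then refuse to reacquire backing store: */
	if (obj->userptr.banned)
		return -EFAULT;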

With more locks I am quite fearful of what lockdep will say, but let's see...

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_userptr.c | 29 ++++++++++++++++++++++++++++-
>   1 file changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index d596a8302ca3..938107dffd37 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -47,6 +47,7 @@ struct i915_mm_struct {
>   
>   struct i915_mmu_notifier {
>   	spinlock_t lock;
> +	struct rw_semaphore sem;
>   	struct hlist_node node;
>   	struct mmu_notifier mn;
>   	struct rb_root_cached objects;
> @@ -123,6 +124,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
>   	struct interval_tree_node *it;
>   	LIST_HEAD(cancelled);
>   
> +	down_write(&mn->sem);
> +
>   	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
>   		return;
>   
> @@ -156,8 +159,20 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
>   		flush_workqueue(mn->wq);
>   }
>   
> +static void i915_gem_userptr_mn_invalidate_range_end(struct mmu_notifier *_mn,
> +						     struct mm_struct *mm,
> +						     unsigned long start,
> +						     unsigned long end)
> +{
> +	struct i915_mmu_notifier *mn =
> +		container_of(_mn, struct i915_mmu_notifier, mn);
> +
> +	up_write(&mn->sem);
> +}
> +
>   static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
>   	.invalidate_range_start = i915_gem_userptr_mn_invalidate_range_start,
> +	.invalidate_range_end = i915_gem_userptr_mn_invalidate_range_end,
>   };
>   
>   static struct i915_mmu_notifier *
> @@ -170,6 +185,7 @@ i915_mmu_notifier_create(struct mm_struct *mm)
>   		return ERR_PTR(-ENOMEM);
>   
>   	spin_lock_init(&mn->lock);
> +	init_rwsem(&mn->sem);
>   	mn->mn.ops = &i915_gem_userptr_notifier;
>   	mn->objects = RB_ROOT_CACHED;
>   	mn->wq = alloc_workqueue("i915-userptr-release",
> @@ -504,12 +520,15 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
>   
>   	pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
>   	if (pvec != NULL) {
> +		struct i915_mmu_notifier *mn = obj->userptr.mm->mn;
>   		struct mm_struct *mm = obj->userptr.mm->mm;
>   		unsigned int flags = 0;
>   
>   		if (!obj->userptr.read_only)
>   			flags |= FOLL_WRITE;
>   
> +		down_read(&mn->sem);
> +
>   		ret = -EFAULT;
>   		if (mmget_not_zero(mm)) {
>   			down_read(&mm->mmap_sem);
> @@ -528,6 +547,8 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
>   			up_read(&mm->mmap_sem);
>   			mmput(mm);
>   		}
> +
> +		up_read(&mn->sem);
>   	}
>   
>   	mutex_lock(&obj->mm.lock);
> @@ -636,15 +657,21 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
>   	pinned = 0;
>   
>   	if (mm == current->mm) {
> +		struct i915_mmu_notifier *mn = obj->userptr.mm->mn;
> +
>   		pvec = kvmalloc_array(num_pages, sizeof(struct page *),
>   				      GFP_KERNEL |
>   				      __GFP_NORETRY |
>   				      __GFP_NOWARN);
> -		if (pvec) /* defer to worker if malloc fails */
> +
> +		/* defer to worker if malloc fails */
> +		if (pvec && down_read_trylock(&mn->sem)) {
>   			pinned = __get_user_pages_fast(obj->userptr.ptr,
>   						       num_pages,
>   						       !obj->userptr.read_only,
>   						       pvec);
> +			up_read(&mn->sem);
> +		}
>   	}
>   
>   	active = false;
> 

Simple enough, but I don't dare say anything until results from the 
shards arrive.

Regards,

Tvrtko
Chris Wilson March 26, 2018, 4:28 p.m. UTC | #2
Quoting Tvrtko Ursulin (2018-03-26 16:59:20)
> 
> On 26/03/2018 15:59, Chris Wilson wrote:
> > We've always been blatantly ignoring mmu_notifier.h:
> > 
> >   * Invalidation of multiple concurrent ranges may be
> >   * optionally permitted by the driver. Either way the
> >   * establishment of sptes is forbidden in the range passed to
> >   * invalidate_range_begin/end for the whole duration of the
> >   * invalidate_range_begin/end critical section.
> > 
> > by not preventing concurrent calls to gup while an invalidate_range is
> > being processed. Wrap gup and invalidate_range in a paired rw_semaphore
> > to allow concurrent lookups that are then interrupted and disabled
> > across the invalidate_range. Further refinement can be applied by
> > tracking the invalidate_range versus individual gup, which should be
> > done using a common set of helpers, since all mmu_notifier subsystems
> > share the same need. I hear HMM is one such toolbox...
> > 
> > For the time being, assume concurrent invalidate and lookup are rare,
> > but not rare enough to completely ignore.
> 
> I think I suggested a few times that we should just "ban" the object on 
> its first invalidate and never again, for its lifetime, allow it to 
> obtain backing store. I just don't remember why we decided not to go 
> with that approach. :( Thinking about it now I still don't see how this 
> restriction would be a problem, and it would simplify things.

You still have the problem where it is being banned as we are trying to
instantiate it the first time. Imo, we are re-implementing mmap_sem
crudely. (Even more so when every mmu notifier must implement the same
code, and more than one will be called every time the mm is touched.)

And we can get perfectly innocent invalidates, e.g. mprotect.
 
> With more locks I am quite fearful of what lockdep will say, but let's see...

Same here.
-Chris
Daniel Vetter March 26, 2018, 7:45 p.m. UTC | #3
On Mon, Mar 26, 2018 at 05:28:58PM +0100, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-03-26 16:59:20)
> > 
> > On 26/03/2018 15:59, Chris Wilson wrote:
> > > We've always been blatantly ignoring mmu_notifier.h:
> > > 
> > >   * Invalidation of multiple concurrent ranges may be
> > >   * optionally permitted by the driver. Either way the
> > >   * establishment of sptes is forbidden in the range passed to
> > >   * invalidate_range_begin/end for the whole duration of the
> > >   * invalidate_range_begin/end critical section.
> > > 
> > > by not preventing concurrent calls to gup while an invalidate_range is
> > > being processed. Wrap gup and invalidate_range in a paired rw_semaphore
> > > to allow concurrent lookups that are then interrupted and disabled
> > > across the invalidate_range. Further refinement can be applied by
> > > tracking the invalidate_range versus individual gup, which should be
> > > done using a common set of helpers, since all mmu_notifier subsystems
> > > share the same need. I hear HMM is one such toolbox...
> > > 
> > > For the time being, assume concurrent invalidate and lookup are rare,
> > > but not rare enough to completely ignore.
> > 
> > I think I suggested a few times that we should just "ban" the object on 
> > its first invalidate and never again, for its lifetime, allow it to 
> > obtain backing store. I just don't remember why we decided not to go 
> > with that approach. :( Thinking about it now I still don't see how this 
> > restriction would be a problem, and it would simplify things.
> 
> You still have the problem where it is being banned as we are trying to
> instantiate it the first time. Imo, we are re-implementing mmap_sem
> crudely. (Even more so when every mmu notifier must implement the same
> code, and more than one will be called every time the mm is touched.)

Jerome Glisse is promising to get that fixed and provide neater
primitives. Apparently we can reuse parts of the HMM stuff since that's
built as a helper library (even though the docs don't make it clear).

> And we can get perfectly innocent invalidates, e.g. mprotect.

Also hugepage consolidation, memory migration, swapout and everything else
which really should work.

> > With more locks I am quite fearful of what lockdep will say, but let's see...
> 
> Same here.

Cross-release enabled lockdep would be even better, because of our various
worker tricks which do hide lock deps from current lockdep. We should
still have the required fixups hanging around in topic/core-for-CI.
-Daniel
Chris Wilson March 26, 2018, 8:08 p.m. UTC | #4
Quoting Patchwork (2018-03-26 17:53:44)
> Test gem_userptr_blits:
>         Subgroup coherency-unsync:
>                 pass       -> INCOMPLETE (shard-hsw)

Forgot that obj->userptr.mn may not exist.

>         Subgroup dmabuf-sync:
>                 pass       -> DMESG-WARN (shard-hsw)

But this is the tricky lockdep one, warning of the recursion from gup
into mmu_invalidate_range, i.e.

down_read(&i915_mmu_notifier->sem);
down_read(&mm_struct->mmap_sem);
	gup();
		down_write(&i915_mmu_notifier->sem);

That seems a genuine deadlock... So I wonder how we managed to get a
lockdep splat and not a dead machine. Maybe gup never triggers the
recursion for our set of flags? Hmm.
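
One plausible path for that recursion, sketched from reading the fault
code rather than from an actual trace:

	get_user_pages()
	  handle_mm_fault()
	    wp_page_copy()                           /* breaking COW for FOLL_WRITE */
	      mmu_notifier_invalidate_range_start()  /* -> down_write(&mn->sem) */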
-Chris
Chris Wilson March 26, 2018, 10:38 p.m. UTC | #5
Quoting Chris Wilson (2018-03-26 21:08:33)
> Quoting Patchwork (2018-03-26 17:53:44)
> > Test gem_userptr_blits:
> >         Subgroup coherency-unsync:
> >                 pass       -> INCOMPLETE (shard-hsw)
> 
> Forgot that obj->userptr.mn may not exist.
> 
> >         Subgroup dmabuf-sync:
> >                 pass       -> DMESG-WARN (shard-hsw)
> 
> But this is the tricky lockdep one, warning of the recursion from gup
> into mmu_invalidate_range, i.e.
> 
> down_read(&i915_mmu_notifier->sem);
> down_read(&mm_struct->mmap_sem);
>         gup();
>                 down_write(&i915_mmu_notifier->sem);
> 
> That seems a genuine deadlock... So I wonder how we managed to get a
> lockdep splat and not a dead machine. Maybe gup never triggers the
> recursion for our set of flags? Hmm.

In another universe, CI found

[  255.666496] ======================================================
[  255.666498] WARNING: possible circular locking dependency detected
[  255.666500] 4.16.0-rc6-CI-Trybot_1944+ #1 Tainted: G     U  W       
[  255.666502] ------------------------------------------------------
[  255.666503] gem_userptr_bli/4794 is trying to acquire lock:
[  255.666505]  (fs_reclaim){+.+.}, at: [<00000000e1b95c73>] fs_reclaim_acquire.part.12+0x0/0x30
[  255.666510] 
               but task is already holding lock:
[  255.666512]  (&mn->sem){+.+.}, at: [<000000007c59ba79>] i915_gem_userptr_mn_invalidate_range_start+0x3e/0x1a0 [i915]
[  255.666553] 
               which lock already depends on the new lock.

[  255.666555] 
               the existing dependency chain (in reverse order) is:
[  255.666557] 
               -> #2 (&mn->sem){+.+.}:
[  255.666578]        i915_gem_userptr_mn_invalidate_range_start+0x3e/0x1a0 [i915]
[  255.666581]        __mmu_notifier_invalidate_range_start+0x73/0xb0
[  255.666584]        zap_page_range_single+0xcc/0xe0
[  255.666586]        unmap_mapping_pages+0xd4/0x110
[  255.666606]        i915_vma_revoke_mmap+0x7e/0x1c0 [i915]
[  255.666625]        i915_vma_unbind+0x60a/0xa10 [i915]
[  255.666644]        i915_gem_object_set_tiling+0xf6/0x5b0 [i915]
[  255.666662]        i915_gem_set_tiling_ioctl+0x262/0x2f0 [i915]
[  255.666665]        drm_ioctl_kernel+0x60/0xa0
[  255.666667]        drm_ioctl+0x27e/0x320
[  255.666669]        do_vfs_ioctl+0x8a/0x670
[  255.666670]        SyS_ioctl+0x36/0x70
[  255.666672]        do_syscall_64+0x65/0x1a0
[  255.666675]        entry_SYSCALL_64_after_hwframe+0x42/0xb7
[  255.666676] 
               -> #1 (&mapping->i_mmap_rwsem){++++}:
[  255.666680]        unmap_mapping_pages+0x3d/0x110
[  255.666698]        i915_vma_revoke_mmap+0x7e/0x1c0 [i915]
[  255.666716]        i915_vma_unbind+0x60a/0xa10 [i915]
[  255.666734]        i915_gem_object_unbind+0xa0/0x130 [i915]
[  255.666751]        i915_gem_shrink+0x2d1/0x5d0 [i915]
[  255.666767]        i915_drop_caches_set+0x92/0x190 [i915]
[  255.666770]        simple_attr_write+0xab/0xc0
[  255.666772]        full_proxy_write+0x4b/0x70
[  255.666774]        __vfs_write+0x1e/0x130
[  255.666776]        vfs_write+0xbd/0x1b0
[  255.666778]        SyS_write+0x40/0xa0
[  255.666779]        do_syscall_64+0x65/0x1a0
[  255.666781]        entry_SYSCALL_64_after_hwframe+0x42/0xb7
[  255.666783] 
               -> #0 (fs_reclaim){+.+.}:
[  255.666786]        fs_reclaim_acquire.part.12+0x24/0x30
[  255.666788]        __alloc_pages_nodemask+0x1f1/0x11d0
[  255.666790]        __get_free_pages+0x9/0x40
[  255.666792]        __pud_alloc+0x25/0xb0
[  255.666794]        copy_page_range+0xa75/0xaf0
[  255.666796]        copy_process.part.7+0x1267/0x1d90
[  255.666798]        _do_fork+0xc0/0x6b0
[  255.666800]        do_syscall_64+0x65/0x1a0
[  255.666801]        entry_SYSCALL_64_after_hwframe+0x42/0xb7
[  255.666803] 
               other info that might help us debug this:

[  255.666805] Chain exists of:
                 fs_reclaim --> &mapping->i_mmap_rwsem --> &mn->sem

[  255.666809]  Possible unsafe locking scenario:

[  255.666811]        CPU0                    CPU1
[  255.666812]        ----                    ----
[  255.666814]   lock(&mn->sem);
[  255.666815]                                lock(&mapping->i_mmap_rwsem);
[  255.666817]                                lock(&mn->sem);
[  255.666819]   lock(fs_reclaim);
[  255.666821] 

So a shrinker deadlock. That doesn't look easy to wriggle out of, as we
have a random chunk of code that's between invalidate_range_start and
invalidate_range_end.
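
That is, the window is whatever the caller happens to do between the two
notifier calls, schematically:

	mmu_notifier_invalidate_range_start();  /* we take mn->sem for write */
	/* arbitrary core-mm work here, including GFP_KERNEL allocations
	 * that can enter fs_reclaim and hence i915_gem_shrink() */
	mmu_notifier_invalidate_range_end();    /* we only drop mn->sem here */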
-Chris
Daniel Vetter March 27, 2018, 6:48 a.m. UTC | #6
On Mon, Mar 26, 2018 at 11:38:55PM +0100, Chris Wilson wrote:
> Quoting Chris Wilson (2018-03-26 21:08:33)
> > Quoting Patchwork (2018-03-26 17:53:44)
> > > Test gem_userptr_blits:
> > >         Subgroup coherency-unsync:
> > >                 pass       -> INCOMPLETE (shard-hsw)
> > 
> > Forgot that obj->userptr.mn may not exist.
> > 
> > >         Subgroup dmabuf-sync:
> > >                 pass       -> DMESG-WARN (shard-hsw)
> > 
> > But this is the tricky lockdep one, warning of the recursion from gup
> > into mmu_invalidate_range, i.e.
> > 
> > down_read(&i915_mmu_notifier->sem);
> > down_read(&mm_struct->mmap_sem);
> >         gup();
> >                 down_write(&i915_mmu_notifier->sem);
> > 
> > That seems a genuine deadlock... So I wonder how we managed to get a
> > lockdep splat and not a dead machine. Maybe gup never triggers the
> > recursion for our set of flags? Hmm.
> 
> In another universe, CI found
> 
> [lockdep splat snipped; quoted in full above -- the reported chain is
> fs_reclaim --> &mapping->i_mmap_rwsem --> &mn->sem]
> 
> So a shrinker deadlock. That doesn't look easy to wriggle out of, as we
> have a random chunk of code that's between invalidate_range_start and
> invalidate_range_end.

Christian König said something like "with this design you can't allocate
anything while holding locks you might need from the mmu notifier",
because reclaim eats into the mmu notifiers.

But hey it's before coffee, so probably best you just ignore me :-)
-Daniel
Daniel Vetter March 27, 2018, 7:01 a.m. UTC | #7
On Mon, Mar 26, 2018 at 09:08:33PM +0100, Chris Wilson wrote:
> Quoting Patchwork (2018-03-26 17:53:44)
> > Test gem_userptr_blits:
> >         Subgroup coherency-unsync:
> >                 pass       -> INCOMPLETE (shard-hsw)
> 
> Forgot that obj->userptr.mn may not exist.
> 
> >         Subgroup dmabuf-sync:
> >                 pass       -> DMESG-WARN (shard-hsw)
> 
> But this is the tricky lockdep one, warning of the recursion from gup
> into mmu_invalidate_range, i.e.
> 
> down_read(&i915_mmu_notifier->sem);
> down_read(&mm_struct->mmap_sem);
> 	gup();
> 		down_write(&i915_mmu_notifier->sem);
> 
> That seems a genuine deadlock... So I wonder how we managed to get a
> lockdep splat and not a dead machine. Maybe gup never triggers the
> recursion for our set of flags? Hmm.

Coffee starting to kick in. If we gup a range it's likely the mm won't
kick out the same range, but something else. I guess we'd need a really
huge userptr bo which can't fit into core completely to actually have a
reliable chance at triggering this. Would probably deadlock the box :-/

I think Jerome's recommendation is the sequence counter stuff from kvm,
plus retrying forever on the gup side. That would convert the same
deadlock into a livelock, but well, you can't have it all :-) And I think once
you've killed the task the gup worker hopefully realizes it's wasting time
and gives up.

For the kvm stuff: look at the #intel-gfx scrollback, where we discussed
all the necessary bits. Plus Jerome showed some new helpers that would
avoid the hand-rolling.
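
Very roughly, that pattern looks like the below (illustrative names, not
the actual kvm code):

	/* invalidate_range_start */
	spin_lock(&mn->lock);
	mn->invalidate_count++;
	spin_unlock(&mn->lock);

	/* invalidate_range_end */
	spin_lock(&mn->lock);
	mn->invalidate_seq++;
	mn->invalidate_count--;
	spin_unlock(&mn->lock);

	/* gup side: never block on the notifier, just retry */
	do {
		seq = read_invalidate_seq(mn);
		pinned = __get_user_pages_fast(ptr, num_pages, write, pvec);
		if (!invalidate_raced(mn, seq))
			break;
		release_pages(pvec, pinned);	/* raced, drop and retry */
	} while (1);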
-Daniel
Chris Wilson March 27, 2018, 7:19 a.m. UTC | #8
Quoting Daniel Vetter (2018-03-27 07:48:00)
> On Mon, Mar 26, 2018 at 11:38:55PM +0100, Chris Wilson wrote:
> > Quoting Chris Wilson (2018-03-26 21:08:33)
> > > Quoting Patchwork (2018-03-26 17:53:44)
> > > > Test gem_userptr_blits:
> > > >         Subgroup coherency-unsync:
> > > >                 pass       -> INCOMPLETE (shard-hsw)
> > > 
> > > Forgot that obj->userptr.mn may not exist.
> > > 
> > > >         Subgroup dmabuf-sync:
> > > >                 pass       -> DMESG-WARN (shard-hsw)
> > > 
> > > But this is the tricky lockdep one, warning of the recursion from gup
> > > into mmu_invalidate_range, i.e.
> > > 
> > > down_read(&i915_mmu_notifier->sem);
> > > down_read(&mm_struct->mmap_sem);
> > >         gup();
> > >                 down_write(&i915_mmu_notifier->sem);
> > > 
> > > That seems a genuine deadlock... So I wonder how we managed to get a
> > > lockdep splat and not a dead machine. Maybe gup never triggers the
> > > recursion for our set of flags? Hmm.
> > 
> > In another universe, CI found
> > 
> > [lockdep splat snipped; quoted in full above -- the reported chain is
> > fs_reclaim --> &mapping->i_mmap_rwsem --> &mn->sem]
> > 
> > So a shrinker deadlock. That doesn't look easy to wriggle out of, as we
> > have a random chunk of code that's between invalidate_range_start and
> > invalidate_range_end.
> 
> Christian König said something like "with this design you can't allocate
> anything while holding locks you might need from the mmu notifier",
> because reclaim eats into the mmu notifiers.

Oh, we aren't allocating from under the locks. That's the first thing I
double checked. Afaict, the only window is the code in the caller that's
between range_start/range_end. If that also can't touch fs_reclaim, then
this is just a red herring...
-Chris
Chris Wilson March 27, 2018, 7:21 a.m. UTC | #9
Quoting Daniel Vetter (2018-03-27 08:01:17)
> On Mon, Mar 26, 2018 at 09:08:33PM +0100, Chris Wilson wrote:
> > Quoting Patchwork (2018-03-26 17:53:44)
> > > Test gem_userptr_blits:
> > >         Subgroup coherency-unsync:
> > >                 pass       -> INCOMPLETE (shard-hsw)
> > 
> > Forgot that obj->userptr.mn may not exist.
> > 
> > >         Subgroup dmabuf-sync:
> > >                 pass       -> DMESG-WARN (shard-hsw)
> > 
> > But this is the tricky lockdep one, warning of the recursion from gup
> > into mmu_invalidate_range, i.e.
> > 
> > down_read(&i915_mmu_notifier->sem);
> > down_read(&mm_struct->mmap_sem);
> >       gup();
> >               down_write(&i915_mmu_notifier->sem);
> > 
> > That seems a genuine deadlock... So I wonder how we managed to get a
> > lockdep splat and not a dead machine. Maybe gup never triggers the
> > recursion for our set of flags? Hmm.
> 
> Coffee starting to kick in. If we gup a range it's likely the mm won't
> kick out the same range, but something else. I guess we'd need a really
> huge userptr bo which can't fit into core completely to actually have a
> reliable chance at triggering this. Would probably deadlock the box :-/
> 
> I think Jerome's recommendation is the sequence counter stuff from kvm,
> plus retrying forever on the gup side. That would convert the same
> deadlock into a livelock, but well, you can't have it all :-)

Pre-coffee state also thinks it would trigger a second fs_reclaim
lockdep splat if it were sufficiently annotated.
-Chris
Daniel Vetter March 27, 2018, 10:01 a.m. UTC | #10
On Tue, Mar 27, 2018 at 08:19:33AM +0100, Chris Wilson wrote:
> Quoting Daniel Vetter (2018-03-27 07:48:00)
> > On Mon, Mar 26, 2018 at 11:38:55PM +0100, Chris Wilson wrote:
> > > Quoting Chris Wilson (2018-03-26 21:08:33)
> > > > Quoting Patchwork (2018-03-26 17:53:44)
> > > > > Test gem_userptr_blits:
> > > > >         Subgroup coherency-unsync:
> > > > >                 pass       -> INCOMPLETE (shard-hsw)
> > > > 
> > > > Forgot that obj->userptr.mn may not exist.
> > > > 
> > > > >         Subgroup dmabuf-sync:
> > > > >                 pass       -> DMESG-WARN (shard-hsw)
> > > > 
> > > > But this is the tricky lockdep one, warning of the recursion from gup
> > > > into mmu_invalidate_range, i.e.
> > > > 
> > > > down_read(&i915_mmu_notifier->sem);
> > > > down_read(&mm_struct->mmap_sem);
> > > >         gup();
> > >                 down_write(&i915_mmu_notifier->sem);
> > > > 
> > > > That seems a genuine deadlock... So I wonder how we managed to get a
> > > > lockdep splat and not a dead machine. Maybe gup never triggers the
> > > > recursion for our set of flags? Hmm.
> > > 
> > > In another universe, CI found
> > > 
> > > [lockdep splat snipped; quoted in full above -- the reported chain is
> > > fs_reclaim --> &mapping->i_mmap_rwsem --> &mn->sem]
> > > 
> > > So a shrinker deadlock. That doesn't look easy to wriggle out of, as we
> > > have a random chunk of code that's between invalidate_range_start and
> > > invalidate_range_end.
> > 
> > Christian König said something like "with this design you can't allocate
> > anything while holding locks you might need from the mmu notifier",
> > because reclaim eats into the mmu notifiers.
> 
> Oh, we aren't allocating from under the locks. That's the first thing I
> double checked. Afaict, the only window is the code in the caller that's
> between range_start/range_end. If that also can't touch fs_reclaim, then
> this is just a red herring...

Where is that happening? You left out the backtrace for the fs_reclaim hit
that closed the loop. Is it our code, or just something else going on?
-Daniel
Chris Wilson March 27, 2018, 10:10 a.m. UTC | #11
Quoting Daniel Vetter (2018-03-27 11:01:09)
> On Tue, Mar 27, 2018 at 08:19:33AM +0100, Chris Wilson wrote:
> > Quoting Daniel Vetter (2018-03-27 07:48:00)
> > > On Mon, Mar 26, 2018 at 11:38:55PM +0100, Chris Wilson wrote:
> > > > Quoting Chris Wilson (2018-03-26 21:08:33)
> > > > > Quoting Patchwork (2018-03-26 17:53:44)
> > > > > > Test gem_userptr_blits:
> > > > > >         Subgroup coherency-unsync:
> > > > > >                 pass       -> INCOMPLETE (shard-hsw)
> > > > > 
> > > > > Forgot that obj->userptr.mn may not exist.
> > > > > 
> > > > > >         Subgroup dmabuf-sync:
> > > > > >                 pass       -> DMESG-WARN (shard-hsw)
> > > > > 
> > > > > But this is the tricky lockdep one, warning of the recursion from gup
> > > > > into mmu_invalidate_range, i.e.
> > > > > 
> > > > > down_read(&i915_mmu_notifier->sem);
> > > > > down_read(&mm_struct->mmap_sem);
> > > > >         gup();
> > > > >               down_write(&i915_mmu_notifier->sem);
> > > > > 
> > > > > That seems a genuine deadlock... So I wonder how we managed to get a
> > > > > lockdep splat and not a dead machine. Maybe gup never triggers the
> > > > > recursion for our set of flags? Hmm.
> > > > 
> > > > In another universe, CI found
> > > > 
> > > > [lockdep splat snipped; quoted in full above -- the reported chain is
> > > > fs_reclaim --> &mapping->i_mmap_rwsem --> &mn->sem]
> > > > 
> > > > So a shrinker deadlock. That doesn't look easy to wriggle out of, as we
> > > > have a random chunk of code that's between invalidate_range_start and
> > > > invalidate_range_end.
> > > 
> > > Christian König said something like "with this design you can't allocate
> > > anything while holding locks you might need from the mmu notifier",
> > > because reclaim eats into the mmu notifiers.
> > 
> > Oh, we aren't allocating from under the locks. That's the first thing I
> > double checked. Afaict, the only window is the code in the caller that's
> > between range_start/range_end. If that also can't touch fs_reclaim, then
> > this is just a red herring...
> 
> Where is that happening? You left out the backtrace for the fs_reclaim hit
> that closed the loop. Is it our code, or just something else going on?

It was from fork:

[   48.013723] 3 locks held by gem_userptr_bli/1336:
[   48.013725]  #0:  (&mm->mmap_sem){++++}, at: [<000000007282305d>] copy_process.part.7+0xe29/0x1d90
[   48.013730]  #1:  (&mm->mmap_sem/1){+.+.}, at: [<000000008e133750>] copy_process.part.7+0xe4d/0x1d90
[   48.013735]  #2:  (&mn->sem){+.+.}, at: [<0000000076ac255d>] i915_gem_userptr_mn_invalidate_range_start+0x3e/0x1a0 [i915]
[   48.013759] 
               stack backtrace:
[   48.013762] CPU: 1 PID: 1336 Comm: gem_userptr_bli Tainted: G     U           4.16.0-rc6-CI-Trybot_1944+ #1
[   48.013765] Hardware name:  /NUC7i5BNB, BIOS BNKBL357.86A.0054.2017.1025.1822 10/25/2017
[   48.013768] Call Trace:
[   48.013771]  dump_stack+0x5f/0x86
[   48.013774]  print_circular_bug.isra.18+0x1d0/0x2c0
[   48.013777]  __lock_acquire+0x14ae/0x1b60
[   48.013781]  ? lock_acquire+0xaf/0x200
[   48.013783]  lock_acquire+0xaf/0x200
[   48.013785]  ? page_frag_free+0x60/0x60
[   48.013788]  fs_reclaim_acquire.part.12+0x24/0x30
[   48.013790]  ? page_frag_free+0x60/0x60
[   48.013792]  __alloc_pages_nodemask+0x1f1/0x11d0
[   48.013814]  ? i915_gem_userptr_mn_invalidate_range_start+0x3e/0x1a0 [i915]
[   48.013817]  ? reacquire_held_locks+0xa2/0x170
[   48.013819]  ? reacquire_held_locks+0xa2/0x170
[   48.013840]  ? i915_gem_userptr_mn_invalidate_range_start+0x3e/0x1a0 [i915]
[   48.013844]  ? __mmu_notifier_invalidate_range_start+0x7b/0xb0
[   48.013847]  __get_free_pages+0x9/0x40
[   48.013849]  __pud_alloc+0x25/0xb0
[   48.013851]  copy_page_range+0xa75/0xaf0
[   48.013854]  ? lock_acquire+0xaf/0x200
[   48.013857]  copy_process.part.7+0x1267/0x1d90
[   48.013861]  _do_fork+0xc0/0x6b0
[   48.013864]  ? entry_SYSCALL_64_after_hwframe+0x52/0xb7
[   48.013866]  ? do_syscall_64+0x19/0x1a0
[   48.013868]  do_syscall_64+0x65/0x1a0
[   48.013871]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

The "? i915_gem" entries looked incriminating but have to be a mirage given
the caller; either way, we are inside a range_start/range_end section when
fork does the alloc_pages.
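
Schematically, per the trace above:

	copy_page_range()
	  mmu_notifier_invalidate_range_start()  /* -> down_write(&mn->sem) */
	  __pud_alloc()
	    __get_free_pages(GFP_KERNEL)          /* -> fs_reclaim_acquire() */
	  ...
	  mmu_notifier_invalidate_range_end()     /* mn->sem only dropped here */

so the allocation is the core mm's own, made while we still hold mn->sem.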
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index d596a8302ca3..938107dffd37 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -47,6 +47,7 @@  struct i915_mm_struct {
 
 struct i915_mmu_notifier {
 	spinlock_t lock;
+	struct rw_semaphore sem;
 	struct hlist_node node;
 	struct mmu_notifier mn;
 	struct rb_root_cached objects;
@@ -123,6 +124,8 @@  static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 	struct interval_tree_node *it;
 	LIST_HEAD(cancelled);
 
+	down_write(&mn->sem);
+
 	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
 		return;
 
@@ -156,8 +159,20 @@  static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 		flush_workqueue(mn->wq);
 }
 
+static void i915_gem_userptr_mn_invalidate_range_end(struct mmu_notifier *_mn,
+						     struct mm_struct *mm,
+						     unsigned long start,
+						     unsigned long end)
+{
+	struct i915_mmu_notifier *mn =
+		container_of(_mn, struct i915_mmu_notifier, mn);
+
+	up_write(&mn->sem);
+}
+
 static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
 	.invalidate_range_start = i915_gem_userptr_mn_invalidate_range_start,
+	.invalidate_range_end = i915_gem_userptr_mn_invalidate_range_end,
 };
 
 static struct i915_mmu_notifier *
@@ -170,6 +185,7 @@  i915_mmu_notifier_create(struct mm_struct *mm)
 		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&mn->lock);
+	init_rwsem(&mn->sem);
 	mn->mn.ops = &i915_gem_userptr_notifier;
 	mn->objects = RB_ROOT_CACHED;
 	mn->wq = alloc_workqueue("i915-userptr-release",
@@ -504,12 +520,15 @@  __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 
 	pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
 	if (pvec != NULL) {
+		struct i915_mmu_notifier *mn = obj->userptr.mm->mn;
 		struct mm_struct *mm = obj->userptr.mm->mm;
 		unsigned int flags = 0;
 
 		if (!obj->userptr.read_only)
 			flags |= FOLL_WRITE;
 
+		down_read(&mn->sem);
+
 		ret = -EFAULT;
 		if (mmget_not_zero(mm)) {
 			down_read(&mm->mmap_sem);
@@ -528,6 +547,8 @@  __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 			up_read(&mm->mmap_sem);
 			mmput(mm);
 		}
+
+		up_read(&mn->sem);
 	}
 
 	mutex_lock(&obj->mm.lock);
@@ -636,15 +657,21 @@  static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
 	pinned = 0;
 
 	if (mm == current->mm) {
+		struct i915_mmu_notifier *mn = obj->userptr.mm->mn;
+
 		pvec = kvmalloc_array(num_pages, sizeof(struct page *),
 				      GFP_KERNEL |
 				      __GFP_NORETRY |
 				      __GFP_NOWARN);
-		if (pvec) /* defer to worker if malloc fails */
+
+		/* defer to worker if malloc fails */
+		if (pvec && down_read_trylock(&mn->sem)) {
 			pinned = __get_user_pages_fast(obj->userptr.ptr,
 						       num_pages,
 						       !obj->userptr.read_only,
 						       pvec);
+			up_read(&mn->sem);
+		}
 	}
 
 	active = false;