Message ID | 20190606184438.31646-3-jgg@ziepe.ca (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Jason Gunthorpe |
Headers | show |
Series | Various revisions from a locking/code review | expand |
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: > From: Jason Gunthorpe <jgg@mellanox.com> > > Ralph observes that hmm_range_register() can only be called by a driver > while a mirror is registered. Make this clear in the API by passing in the > mirror structure as a parameter. > > This also simplifies understanding the lifetime model for struct hmm, as > the hmm pointer must be valid as part of a registered mirror so all we > need in hmm_register_range() is a simple kref_get. > > Suggested-by: Ralph Campbell <rcampbell@nvidia.com> > Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> > --- > v2 > - Include the oneline patch to nouveau_svm.c > --- > drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- > include/linux/hmm.h | 7 ++++--- > mm/hmm.c | 15 ++++++--------- > 3 files changed, 11 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c > index 93ed43c413f0bb..8c92374afcf227 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_svm.c > +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c > @@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify) > range.values = nouveau_svm_pfn_values; > range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; > again: > - ret = hmm_vma_fault(&range, true); > + ret = hmm_vma_fault(&svmm->mirror, &range, true); > if (ret == 0) { > mutex_lock(&svmm->mutex); > if (!hmm_vma_range_done(&range)) { > diff --git a/include/linux/hmm.h b/include/linux/hmm.h > index 688c5ca7068795..2d519797cb134a 100644 > --- a/include/linux/hmm.h > +++ b/include/linux/hmm.h > @@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) > * Please see Documentation/vm/hmm.rst for how to use the range API. 
> */ > int hmm_range_register(struct hmm_range *range, > - struct mm_struct *mm, > + struct hmm_mirror *mirror, > unsigned long start, > unsigned long end, > unsigned page_shift); > @@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range) > } > > /* This is a temporary helper to avoid merge conflict between trees. */ > -static inline int hmm_vma_fault(struct hmm_range *range, bool block) > +static inline int hmm_vma_fault(struct hmm_mirror *mirror, > + struct hmm_range *range, bool block) > { > long ret; > > @@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) > range->default_flags = 0; > range->pfn_flags_mask = -1UL; > > - ret = hmm_range_register(range, range->vma->vm_mm, > + ret = hmm_range_register(range, mirror, > range->start, range->end, > PAGE_SHIFT); > if (ret) > diff --git a/mm/hmm.c b/mm/hmm.c > index 547002f56a163d..8796447299023c 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range, > * Track updates to the CPU page table see include/linux/hmm.h > */ > int hmm_range_register(struct hmm_range *range, > - struct mm_struct *mm, > + struct hmm_mirror *mirror, > unsigned long start, > unsigned long end, > unsigned page_shift) > { > unsigned long mask = ((1UL << page_shift) - 1UL); > - struct hmm *hmm; > + struct hmm *hmm = mirror->hmm; > > range->valid = false; > range->hmm = NULL; > @@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range, > range->start = start; > range->end = end; > > - hmm = hmm_get_or_create(mm); > - if (!hmm) > - return -EFAULT; > - > /* Check if hmm_mm_destroy() was call. */ > - if (hmm->mm == NULL || hmm->dead) { > - hmm_put(hmm); > + if (hmm->mm == NULL || hmm->dead) > return -EFAULT; > - } > + > + range->hmm = hmm; > + kref_get(&hmm->kref); > > /* Initialize range to track CPU page table updates. 
*/ > mutex_lock(&hmm->lock); > I'm not a qualified Nouveau reviewer, but this looks obviously correct, so: Reviewed-by: John Hubbard <jhubbard@nvidia.com> thanks,
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: > From: Jason Gunthorpe <jgg@mellanox.com> > > Ralph observes that hmm_range_register() can only be called by a driver > while a mirror is registered. Make this clear in the API by passing in the > mirror structure as a parameter. > > This also simplifies understanding the lifetime model for struct hmm, as > the hmm pointer must be valid as part of a registered mirror so all we > need in hmm_register_range() is a simple kref_get. > > Suggested-by: Ralph Campbell <rcampbell@nvidia.com> > Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> You might CC Ben for the nouveau part. CC: Ben Skeggs <bskeggs@redhat.com> Reviewed-by: Ralph Campbell <rcampbell@nvidia.com> > --- > v2 > - Include the oneline patch to nouveau_svm.c > --- > drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- > include/linux/hmm.h | 7 ++++--- > mm/hmm.c | 15 ++++++--------- > 3 files changed, 11 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c > index 93ed43c413f0bb..8c92374afcf227 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_svm.c > +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c > @@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify) > range.values = nouveau_svm_pfn_values; > range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; > again: > - ret = hmm_vma_fault(&range, true); > + ret = hmm_vma_fault(&svmm->mirror, &range, true); > if (ret == 0) { > mutex_lock(&svmm->mutex); > if (!hmm_vma_range_done(&range)) { > diff --git a/include/linux/hmm.h b/include/linux/hmm.h > index 688c5ca7068795..2d519797cb134a 100644 > --- a/include/linux/hmm.h > +++ b/include/linux/hmm.h > @@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) > * Please see Documentation/vm/hmm.rst for how to use the range API. 
> */ > int hmm_range_register(struct hmm_range *range, > - struct mm_struct *mm, > + struct hmm_mirror *mirror, > unsigned long start, > unsigned long end, > unsigned page_shift); > @@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range) > } > > /* This is a temporary helper to avoid merge conflict between trees. */ > -static inline int hmm_vma_fault(struct hmm_range *range, bool block) > +static inline int hmm_vma_fault(struct hmm_mirror *mirror, > + struct hmm_range *range, bool block) > { > long ret; > > @@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) > range->default_flags = 0; > range->pfn_flags_mask = -1UL; > > - ret = hmm_range_register(range, range->vma->vm_mm, > + ret = hmm_range_register(range, mirror, > range->start, range->end, > PAGE_SHIFT); > if (ret) > diff --git a/mm/hmm.c b/mm/hmm.c > index 547002f56a163d..8796447299023c 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range, > * Track updates to the CPU page table see include/linux/hmm.h > */ > int hmm_range_register(struct hmm_range *range, > - struct mm_struct *mm, > + struct hmm_mirror *mirror, > unsigned long start, > unsigned long end, > unsigned page_shift) > { > unsigned long mask = ((1UL << page_shift) - 1UL); > - struct hmm *hmm; > + struct hmm *hmm = mirror->hmm; > > range->valid = false; > range->hmm = NULL; > @@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range, > range->start = start; > range->end = end; > > - hmm = hmm_get_or_create(mm); > - if (!hmm) > - return -EFAULT; > - > /* Check if hmm_mm_destroy() was call. */ > - if (hmm->mm == NULL || hmm->dead) { > - hmm_put(hmm); > + if (hmm->mm == NULL || hmm->dead) > return -EFAULT; > - } > + > + range->hmm = hmm; > + kref_get(&hmm->kref); > > /* Initialize range to track CPU page table updates. */ > mutex_lock(&hmm->lock); >
On Thu, Jun 06, 2019 at 03:44:29PM -0300, Jason Gunthorpe wrote: > From: Jason Gunthorpe <jgg@mellanox.com> > > Ralph observes that hmm_range_register() can only be called by a driver > while a mirror is registered. Make this clear in the API by passing in the > mirror structure as a parameter. > > This also simplifies understanding the lifetime model for struct hmm, as > the hmm pointer must be valid as part of a registered mirror so all we > need in hmm_register_range() is a simple kref_get. > > Suggested-by: Ralph Campbell <rcampbell@nvidia.com> > Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> > --- > v2 > - Include the oneline patch to nouveau_svm.c > --- > drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- > include/linux/hmm.h | 7 ++++--- > mm/hmm.c | 15 ++++++--------- > 3 files changed, 11 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c > index 93ed43c413f0bb..8c92374afcf227 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_svm.c > +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c > @@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify) > range.values = nouveau_svm_pfn_values; > range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; > again: > - ret = hmm_vma_fault(&range, true); > + ret = hmm_vma_fault(&svmm->mirror, &range, true); > if (ret == 0) { > mutex_lock(&svmm->mutex); > if (!hmm_vma_range_done(&range)) { > diff --git a/include/linux/hmm.h b/include/linux/hmm.h > index 688c5ca7068795..2d519797cb134a 100644 > --- a/include/linux/hmm.h > +++ b/include/linux/hmm.h > @@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) > * Please see Documentation/vm/hmm.rst for how to use the range API. 
> */ > int hmm_range_register(struct hmm_range *range, > - struct mm_struct *mm, > + struct hmm_mirror *mirror, > unsigned long start, > unsigned long end, > unsigned page_shift); > @@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range) > } > > /* This is a temporary helper to avoid merge conflict between trees. */ > -static inline int hmm_vma_fault(struct hmm_range *range, bool block) > +static inline int hmm_vma_fault(struct hmm_mirror *mirror, > + struct hmm_range *range, bool block) > { > long ret; > > @@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) > range->default_flags = 0; > range->pfn_flags_mask = -1UL; > > - ret = hmm_range_register(range, range->vma->vm_mm, > + ret = hmm_range_register(range, mirror, > range->start, range->end, > PAGE_SHIFT); > if (ret) > diff --git a/mm/hmm.c b/mm/hmm.c > index 547002f56a163d..8796447299023c 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range, > * Track updates to the CPU page table see include/linux/hmm.h > */ > int hmm_range_register(struct hmm_range *range, > - struct mm_struct *mm, > + struct hmm_mirror *mirror, > unsigned long start, > unsigned long end, > unsigned page_shift) > { > unsigned long mask = ((1UL << page_shift) - 1UL); > - struct hmm *hmm; > + struct hmm *hmm = mirror->hmm; > > range->valid = false; > range->hmm = NULL; > @@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range, > range->start = start; > range->end = end; > > - hmm = hmm_get_or_create(mm); > - if (!hmm) > - return -EFAULT; > - > /* Check if hmm_mm_destroy() was call. */ > - if (hmm->mm == NULL || hmm->dead) { > - hmm_put(hmm); > + if (hmm->mm == NULL || hmm->dead) > return -EFAULT; > - } > + > + range->hmm = hmm; I don't think you need this assignment here. In the code below (right after the mutext_lock()) it is set already. And looks like it remains that way after the end of the series. 
Reviewed-by: Ira Weiny <ira.weiny@intel.com> > + kref_get(&hmm->kref); > > /* Initialize range to track CPU page table updates. */ > mutex_lock(&hmm->lock); > -- > 2.21.0 >
On 6/7/19 11:24 AM, Ralph Campbell wrote: > > On 6/6/19 11:44 AM, Jason Gunthorpe wrote: >> From: Jason Gunthorpe <jgg@mellanox.com> >> >> Ralph observes that hmm_range_register() can only be called by a driver >> while a mirror is registered. Make this clear in the API by passing in >> the >> mirror structure as a parameter. >> >> This also simplifies understanding the lifetime model for struct hmm, as >> the hmm pointer must be valid as part of a registered mirror so all we >> need in hmm_register_range() is a simple kref_get. >> >> Suggested-by: Ralph Campbell <rcampbell@nvidia.com> >> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> > > You might CC Ben for the nouveau part. > CC: Ben Skeggs <bskeggs@redhat.com> > > Reviewed-by: Ralph Campbell <rcampbell@nvidia.com> > > >> --- >> v2 >> - Include the oneline patch to nouveau_svm.c >> --- >> drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- >> include/linux/hmm.h | 7 ++++--- >> mm/hmm.c | 15 ++++++--------- >> 3 files changed, 11 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c >> b/drivers/gpu/drm/nouveau/nouveau_svm.c >> index 93ed43c413f0bb..8c92374afcf227 100644 >> --- a/drivers/gpu/drm/nouveau/nouveau_svm.c >> +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c >> @@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify) >> range.values = nouveau_svm_pfn_values; >> range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; >> again: >> - ret = hmm_vma_fault(&range, true); >> + ret = hmm_vma_fault(&svmm->mirror, &range, true); >> if (ret == 0) { >> mutex_lock(&svmm->mutex); >> if (!hmm_vma_range_done(&range)) { >> diff --git a/include/linux/hmm.h b/include/linux/hmm.h >> index 688c5ca7068795..2d519797cb134a 100644 >> --- a/include/linux/hmm.h >> +++ b/include/linux/hmm.h >> @@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct >> hmm_mirror *mirror) >> * Please see Documentation/vm/hmm.rst for how to use the range API. 
>> */ >> int hmm_range_register(struct hmm_range *range, >> - struct mm_struct *mm, >> + struct hmm_mirror *mirror, >> unsigned long start, >> unsigned long end, >> unsigned page_shift); >> @@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct >> hmm_range *range) >> } >> /* This is a temporary helper to avoid merge conflict between trees. */ >> -static inline int hmm_vma_fault(struct hmm_range *range, bool block) >> +static inline int hmm_vma_fault(struct hmm_mirror *mirror, >> + struct hmm_range *range, bool block) >> { >> long ret; >> @@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range >> *range, bool block) >> range->default_flags = 0; >> range->pfn_flags_mask = -1UL; >> - ret = hmm_range_register(range, range->vma->vm_mm, >> + ret = hmm_range_register(range, mirror, >> range->start, range->end, >> PAGE_SHIFT); >> if (ret) >> diff --git a/mm/hmm.c b/mm/hmm.c >> index 547002f56a163d..8796447299023c 100644 >> --- a/mm/hmm.c >> +++ b/mm/hmm.c >> @@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range, >> * Track updates to the CPU page table see include/linux/hmm.h >> */ >> int hmm_range_register(struct hmm_range *range, >> - struct mm_struct *mm, >> + struct hmm_mirror *mirror, >> unsigned long start, >> unsigned long end, >> unsigned page_shift) >> { >> unsigned long mask = ((1UL << page_shift) - 1UL); >> - struct hmm *hmm; >> + struct hmm *hmm = mirror->hmm; >> range->valid = false; >> range->hmm = NULL; >> @@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range, >> range->start = start; >> range->end = end; >> - hmm = hmm_get_or_create(mm); >> - if (!hmm) >> - return -EFAULT; >> - >> /* Check if hmm_mm_destroy() was call. */ >> - if (hmm->mm == NULL || hmm->dead) { >> - hmm_put(hmm); >> + if (hmm->mm == NULL || hmm->dead) >> return -EFAULT; >> - } >> + >> + range->hmm = hmm; >> + kref_get(&hmm->kref); >> /* Initialize range to track CPU page table updates. 
*/ >> mutex_lock(&hmm->lock); >> I forgot to add that I think you can delete the duplicate "range->hmm = hmm;" here between the mutex_lock/unlock.
FYI, I very much disagree with the direction this is moving. struct hmm_mirror literally is a trivial duplication of the mmu_notifiers. All these drivers should just use the mmu_notifiers directly for the mirroring part instead of building a thin wrapper that adds nothing but helping to manage the lifetime of struct hmm, which shouldn't exist to start with.
On Fri, Jun 07, 2019 at 03:39:06PM -0700, Ralph Campbell wrote: > > > + range->hmm = hmm; > > > + kref_get(&hmm->kref); > > > /* Initialize range to track CPU page table updates. */ > > > mutex_lock(&hmm->lock); > > > > > I forgot to add that I think you can delete the duplicate > "range->hmm = hmm;" > here between the mutex_lock/unlock. Done, thanks Jason
On Sat, Jun 08, 2019 at 01:54:25AM -0700, Christoph Hellwig wrote: > FYI, I very much disagree with the direction this is moving. > > struct hmm_mirror literally is a trivial duplication of the > mmu_notifiers. All these drivers should just use the mmu_notifiers > directly for the mirroring part instead of building a thing wrapper > that adds nothing but helping to manage the lifetime of struct hmm, > which shouldn't exist to start with. Christoph: What do you think about this sketch below? It would replace the hmm_range/mirror/etc with a different way to build the same locking scheme using some optional helpers linked to the mmu notifier? (just a sketch, still needs a lot more thinking) Jason From 5a91d17bc3b8fcaa685abddaaae5c5aea6f82dca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe <jgg@mellanox.com> Date: Tue, 11 Jun 2019 16:33:33 -0300 Subject: [PATCH] RFC mm: Provide helpers to implement the common mmu_notifier locking Many users of mmu_notifiers require a read/write lock that is write locked during the invalidate_range_start/end period to protect against a parallel thread reading the page tables while another thread is invalidating them. kvm uses a collision-retry lock built with something like a sequence count, and many mmu_notifiers users have copied this approach with various levels of success. Provide a common set of helpers that build a sleepable read side lock using a collision retry scheme. The general usage pattern is: driver pagefault(): struct mmu_invlock_state st = MMU_INVLOCK_STATE_INIT; again: mmu_invlock_write_start_and_lock(&driver->mn, &st) /* read vmas and page data under mmap_sem */ /* maybe sleep */ take_lock(&driver->lock); if (mn_invlock_end_write_and_unlock(&driver->mn, &st)) { unlock(&driver->lock); goto again; } /* make data visible to the device */ /* does not sleep */ unlock(&driver->lock); The driver is responsible to provide the 'driver->lock', which is the same lock it must hold during invalidate_range_start. 
By holding this lock the sequence count is fully locked, and invalidations are prevented, so it is safe to make the work visible to the device. Since it is possible for this to live lock it uses the write side of the mmap_sem to create a slow path if there are repeated collisions. This is based off the design of the hmm_range and the RDMA ODP locking scheme, with some additional refinements. Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> --- include/linux/mmu_notifier.h | 83 ++++++++++++++++++++++++++++++++++++ mm/mmu_notifier.c | 71 ++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b6c004bd9f6ad9..0417f9452f2a09 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -6,6 +6,7 @@ #include <linux/spinlock.h> #include <linux/mm_types.h> #include <linux/srcu.h> +#include <linux/sched.h> struct mmu_notifier; struct mmu_notifier_ops; @@ -227,8 +228,90 @@ struct mmu_notifier_ops { struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; + + /* + * mmu_invlock is a set of helpers to allow the caller to provide a + * read/write lock scheme where the write side of the lock is held + * between invalidate_start -> end, and the read side can be obtained + * on some other thread. This is a common usage pattern for mmu + * notifier users that want to lock against changes to the mmu. 
+ */ + struct mm_struct *mm; + unsigned int active_invalidates; + seqcount_t invalidate_seq; + wait_queue_head_t wq; }; +struct mmu_invlock_state +{ + unsigned long timeout; + unsigned int update_seq; + bool write_locked; +}; + +#define MMU_INVLOCK_STATE_INIT {.timeout = msecs_to_jiffies(1000)} + +// FIXME: needs a seqcount helper +static inline bool is_locked_seqcount(const seqcount_t *s) +{ + return s->sequence & 1; +} + +void mmu_invlock_write_start_and_lock(struct mmu_notifier *mn, + struct mmu_invlock_state *st); +bool mmu_invlock_write_end(struct mmu_notifier *mn); + +/** + * mmu_invlock_inv_start - Call during invalidate_range_start + * @mn - mmu_notifier + * @lock - True if the supplied range is interesting and should cause the + * write side of the lock lock to be held. + * + * Updates the locking state as part of the invalidate_range_start callback. + * This must be called under a user supplied lock, and it must be called for + * every invalidate_range_start. + */ +static inline void mmu_invlock_inv_start(struct mmu_notifier *mn, bool lock) +{ + if (lock && !mn->active_invalidates) + write_seqcount_begin(&mn->invalidate_seq); + mn->active_invalidates++; +} + +/** + * mmu_invlock_inv_start - Call during invalidate_range_start + * @mn - mmu_notifier + * + * Updates the locking state as part of the invalidate_range_start callback. + * This must be called under a user supplied lock, and it must be called for + * every invalidate_range_end. + */ +static inline void mmu_invlock_inv_end(struct mmu_notifier *mn) +{ + mn->active_invalidates++; + if (!mn->active_invalidates && + is_locked_seqcount(&mn->invalidate_seq)) { + write_seqcount_end(&mn->invalidate_seq); + wake_up_all(&mn->wq); + } +} + +/** + * mmu_invlock_write_needs_retry - Check if the write lock has collided + * @mn - mmu_notifier + * @st - lock state set by mmu_invlock_write_start_and_lock() + * + * The nlock uses a collision retry scheme for the fast path. 
If a parallel + * invalidate has collided with the lock then it should be restarted again + * from mmu_invlock_write_start_and_lock() + */ +static inline bool mmu_invlock_write_needs_retry(struct mmu_notifier *mn, + struct mmu_invlock_state *st) +{ + return !st->write_locked && + read_seqcount_retry(&mn->invalidate_seq, st->update_seq); +} + static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->mmu_notifier_mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index ee36068077b6e5..3db8cdd7211285 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -247,6 +247,11 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); + mn->mm = mm; + mn->active_invalidates = 0; + seqcount_init(&mn->invalidate_seq); + init_waitqueue_head(&mn->wq); + ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) @@ -405,3 +410,69 @@ mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) return range->vma->vm_flags & VM_READ; } EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); + +/** + * mm_invlock_start_write_and_lock - Start a read critical section + * @mn - mmu_notifier + * @st - lock state set initialized by MMU_INVLOCK_STATE_INIT + * + * This should be called with the mmap sem unlocked. It will wait for any + * parallel invalidations to complete and return with the mmap_sem locked. The + * mmap_sem may be locked for read or write. + * + * The critical section must always be ended by + * mn_invlock_end_write_and_unlock(). 
+ */ +void mm_invlock_start_write_and_lock(struct mmu_notifier *mn, struct mmu_invlock_state *st) +{ + long ret; + + if (st->timeout == 0) + goto write_out; + + ret = wait_event_timeout( + mn->wq, !is_locked_seqcount(&mn->invalidate_seq), st->timeout); + if (ret == 0) + goto write_out; + + if (ret == 1) + st->timeout = 0; + else + st->timeout = ret; + down_read(&mn->mm->mmap_sem); + return; + +write_out: + /* + * If we ran out of time then fall back to using the mmap_sem write + * side to block concurrent invalidations. The seqcount is an + * optimization to try and avoid this expensive lock. + */ + down_write(&mn->mm->mmap_sem); + st->write_locked = true; +} +EXPORT_SYMBOL_GPL(mm_invlock_start_write_and_lock); + +/** + * mn_invlock_end_write_and_unlock - End a read critical section + * @mn - mmu_notifier + * @st - lock state set by mmu_invlock_write_start_and_lock() + * + * This completes the read side critical section. If it returns false the + * caller must call mm_invlock_start_write_and_lock again. Upon success the + * mmap_sem is unlocked. + * + * The caller must hold the same lock that is held while calling + * mmu_invlock_inv_start() + */ +bool mn_invlock_end_write_and_unlock(struct mmu_notifier *mn, + struct mmu_invlock_state *st) +{ + if (st->write_locked) { + up_write(&mn->mm->mmap_sem); + return true; + } + up_read(&mn->mm->mmap_sem); + return mmu_invlock_write_needs_retry(mn, st); +} +EXPORT_SYMBOL_GPL(mn_invlock_end_write_and_unlock);
On Tue, Jun 11, 2019 at 04:44:31PM -0300, Jason Gunthorpe wrote: > On Sat, Jun 08, 2019 at 01:54:25AM -0700, Christoph Hellwig wrote: > > FYI, I very much disagree with the direction this is moving. > > > > struct hmm_mirror literally is a trivial duplication of the > > mmu_notifiers. All these drivers should just use the mmu_notifiers > > directly for the mirroring part instead of building a thing wrapper > > that adds nothing but helping to manage the lifetime of struct hmm, > > which shouldn't exist to start with. > > Christoph: What do you think about this sketch below? > > It would replace the hmm_range/mirror/etc with a different way to > build the same locking scheme using some optional helpers linked to > the mmu notifier? > > (just a sketch, still needs a lot more thinking) I like the idea. A few nitpicks: Can we avoid having to store the mm in struct mmu_notifier? I think we could just easily pass it as a parameter to the helpers. The write lock case of mm_invlock_start_write_and_lock is probably worth factoring into separate helper? I can see cases where drivers want to just use it directly if they need to force getting the lock without the chance of a long wait.
On Wed, Jun 12, 2019 at 12:12:34AM -0700, Christoph Hellwig wrote: > On Tue, Jun 11, 2019 at 04:44:31PM -0300, Jason Gunthorpe wrote: > > On Sat, Jun 08, 2019 at 01:54:25AM -0700, Christoph Hellwig wrote: > > > FYI, I very much disagree with the direction this is moving. > > > > > > struct hmm_mirror literally is a trivial duplication of the > > > mmu_notifiers. All these drivers should just use the mmu_notifiers > > > directly for the mirroring part instead of building a thing wrapper > > > that adds nothing but helping to manage the lifetime of struct hmm, > > > which shouldn't exist to start with. > > > > Christoph: What do you think about this sketch below? > > > > It would replace the hmm_range/mirror/etc with a different way to > > build the same locking scheme using some optional helpers linked to > > the mmu notifier? > > > > (just a sketch, still needs a lot more thinking) > > I like the idea. A few nitpicks: Can we avoid having to store the > mm in struct mmu_notifier? I think we could just easily pass it as a > parameter to the helpers. Yes, but I think any driver that needs to use this API will have to hold the 'struct mm_struct' and the 'struct mmu_notifier' together (ie ODP does this in ib_ucontext_per_mm), so if we put it in the notifier then it is trivially available everwhere it is needed, and the mmu_notifier code takes care of the lifetime for the driver. > The write lock case of mm_invlock_start_write_and_lock is probably > worth factoring into separate helper? I can see cases where drivers > want to just use it directly if they need to force getting the lock > without the chance of a long wait. The entire purpose of the invlock is to avoid getting the write lock on mmap_sem as a fast path - if the driver wishes to use mmap_sem locking only then it should just do so directly and forget about the invlock. 
Note that this patch is just an API sketch, I haven't fully checked that the range_start/end are actually always called under mmap_sem, and I already found that release is not. So there will need to be some preparatory adjustments before we can use down_write(mmap_sem) as a locking strategy here. Thanks, Jason
On Wed, Jun 12, 2019 at 08:41:25AM -0300, Jason Gunthorpe wrote: > > I like the idea. A few nitpicks: Can we avoid having to store the > > mm in struct mmu_notifier? I think we could just easily pass it as a > > parameter to the helpers. > > Yes, but I think any driver that needs to use this API will have to > hold the 'struct mm_struct' and the 'struct mmu_notifier' together (ie > ODP does this in ib_ucontext_per_mm), so if we put it in the notifier > then it is trivially available everwhere it is needed, and the > mmu_notifier code takes care of the lifetime for the driver. True. Well, maybe keep it for now at least. > The entire purpose of the invlock is to avoid getting the write lock > on mmap_sem as a fast path - if the driver wishes to use mmap_sem > locking only then it should just do so directly and forget about the > invlock. My worry here is that there might be cases where the driver needs to expedite the action, in which case jumping straight to the write lock. But again we can probably skip this for now and see if it really ends up being needed.
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 93ed43c413f0bb..8c92374afcf227 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify) range.values = nouveau_svm_pfn_values; range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; again: - ret = hmm_vma_fault(&range, true); + ret = hmm_vma_fault(&svmm->mirror, &range, true); if (ret == 0) { mutex_lock(&svmm->mutex); if (!hmm_vma_range_done(&range)) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 688c5ca7068795..2d519797cb134a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) * Please see Documentation/vm/hmm.rst for how to use the range API. */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift); @@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range) } /* This is a temporary helper to avoid merge conflict between trees. 
*/ -static inline int hmm_vma_fault(struct hmm_range *range, bool block) +static inline int hmm_vma_fault(struct hmm_mirror *mirror, + struct hmm_range *range, bool block) { long ret; @@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->default_flags = 0; range->pfn_flags_mask = -1UL; - ret = hmm_range_register(range, range->vma->vm_mm, + ret = hmm_range_register(range, mirror, range->start, range->end, PAGE_SHIFT); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index 547002f56a163d..8796447299023c 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range, * Track updates to the CPU page table see include/linux/hmm.h */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); - struct hmm *hmm; + struct hmm *hmm = mirror->hmm; range->valid = false; range->hmm = NULL; @@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - hmm = hmm_get_or_create(mm); - if (!hmm) - return -EFAULT; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) { - hmm_put(hmm); + if (hmm->mm == NULL || hmm->dead) return -EFAULT; - } + + range->hmm = hmm; + kref_get(&hmm->kref); /* Initialize range to track CPU page table updates. */ mutex_lock(&hmm->lock);