diff mbox

[05/36] HMM: introduce heterogeneous memory management v3.

Message ID 1432236705-4209-6-git-send-email-j.glisse@gmail.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Jerome Glisse May 21, 2015, 7:31 p.m. UTC
From: Jérôme Glisse <jglisse@redhat.com>

This patch only introduces core HMM functions for registering a new
mirror and stopping a mirror, as well as HMM device registering and
unregistering.

The lifecycle of the HMM object is handled differently than that of
mmu_notifier because, unlike mmu_notifier, there can be concurrent
calls from both mm code to HMM code and/or from device driver code
to HMM code. Moreover, the lifetime of HMM can be uncorrelated from the
lifetime of the process that is being mirrored (the GPU might take a
longer time to clean up).

Changed since v1:
  - Updated comment of hmm_device_register().

Changed since v2:
  - Expose struct hmm for easy access to mm struct.
  - Simplify hmm_mirror_register() arguments.
  - Removed the device name.
  - Refcount the mirror struct internally to HMM, allowing to get
    rid of the srcu and making the device driver callback error
    handling simpler.
  - Safe to call hmm_mirror_unregister() several times.
  - Rework the mmu_notifier unregistration and release callback.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Jatin Kumar <jakumar@nvidia.com>
cc: <linux-rdma@vger.kernel.org>
---
 MAINTAINERS              |   7 +
 include/linux/hmm.h      | 164 +++++++++++++++++++++
 include/linux/mm.h       |  11 ++
 include/linux/mm_types.h |  14 ++
 kernel/fork.c            |   2 +
 mm/Kconfig               |  15 ++
 mm/Makefile              |   1 +
 mm/hmm.c                 | 370 +++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 584 insertions(+)
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c

Comments

Aneesh Kumar K.V May 27, 2015, 5:50 a.m. UTC | #1
j.glisse@gmail.com writes:

> From: Jérôme Glisse <jglisse@redhat.com>
>
> This patch only introduce core HMM functions for registering a new
> mirror and stopping a mirror as well as HMM device registering and
> unregistering.
>
> The lifecycle of HMM object is handled differently then the one of
> mmu_notifier because unlike mmu_notifier there can be concurrent
> call from both mm code to HMM code and/or from device driver code
> to HMM code. Moreover lifetime of HMM can be uncorrelated from the
> lifetime of the process that is being mirror (GPU might take longer
> time to cleanup).
>

......

> +struct hmm_device_ops {
> +	/* release() - mirror must stop using the address space.
> +	 *
> +	 * @mirror: The mirror that link process address space with the device.
> +	 *
> +	 * When this is call, device driver must kill all device thread using

s/call/called, ?

> +	 * this mirror. Also, this callback is the last thing call by HMM and
> +	 * HMM will not access the mirror struct after this call (ie no more
> +	 * dereference of it so it is safe for the device driver to free it).
> +	 * It is call either from :
> +	 *   - mm dying (all process using this mm exiting).
> +	 *   - hmm_mirror_unregister() (if no other thread holds a reference)
> +	 *   - outcome of some device error reported by any of the device
> +	 *     callback against that mirror.
> +	 */
> +	void (*release)(struct hmm_mirror *mirror);
> +};
> +
> +
> +/* struct hmm - per mm_struct HMM states.
> + *
> + * @mm: The mm struct this hmm is associated with.
> + * @mirrors: List of all mirror for this mm (one per device).
> + * @vm_end: Last valid address for this mm (exclusive).
> + * @kref: Reference counter.
> + * @rwsem: Serialize the mirror list modifications.
> + * @mmu_notifier: The mmu_notifier of this mm.
> + * @rcu: For delayed cleanup call from mmu_notifier.release() callback.
> + *
> + * For each process address space (mm_struct) there is one and only one hmm
> + * struct. hmm functions will redispatch to each devices the change made to
> + * the process address space.
> + *
> + * Device driver must not access this structure other than for getting the
> + * mm pointer.
> + */

.....

>  #ifndef AT_VECTOR_SIZE_ARCH
>  #define AT_VECTOR_SIZE_ARCH 0
>  #endif
> @@ -451,6 +455,16 @@ struct mm_struct {
>  #ifdef CONFIG_MMU_NOTIFIER
>  	struct mmu_notifier_mm *mmu_notifier_mm;
>  #endif
> +#ifdef CONFIG_HMM
> +	/*
> +	 * hmm always register an mmu_notifier we rely on mmu notifier to keep
> +	 * refcount on mm struct as well as forbiding registering hmm on a
> +	 * dying mm
> +	 *
> +	 * This field is set with mmap_sem old in write mode.

s/old/held/ ?


> +	 */
> +	struct hmm *hmm;
> +#endif
>  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
>  	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
>  #endif
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 0e0ae9a..4083be7 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -27,6 +27,7 @@
>  #include <linux/binfmts.h>
>  #include <linux/mman.h>
>  #include <linux/mmu_notifier.h>
> +#include <linux/hmm.h>
>  #include <linux/fs.h>
>  #include <linux/mm.h>
>  #include <linux/vmacache.h>
> @@ -597,6 +598,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
>  	mm_init_aio(mm);
>  	mm_init_owner(mm, p);
>  	mmu_notifier_mm_init(mm);
> +	hmm_mm_init(mm);
>  	clear_tlb_flush_pending(mm);
>  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
>  	mm->pmd_huge_pte = NULL;
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 52ffb86..189e48f 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -653,3 +653,18 @@ config DEFERRED_STRUCT_PAGE_INIT
>  	  when kswapd starts. This has a potential performance impact on
>  	  processes running early in the lifetime of the systemm until kswapd
>  	  finishes the initialisation.
> +
> +if STAGING
> +config HMM
> +	bool "Enable heterogeneous memory management (HMM)"
> +	depends on MMU
> +	select MMU_NOTIFIER
> +	select GENERIC_PAGE_TABLE

What is GENERIC_PAGE_TABLE ?

> +	default n
> +	help
> +	  Heterogeneous memory management provide infrastructure for a device
> +	  to mirror a process address space into an hardware mmu or into any
> +	  things supporting pagefault like event.
> +
> +	  If unsure, say N to disable hmm.

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jerome Glisse May 27, 2015, 2:38 p.m. UTC | #2
On Wed, May 27, 2015 at 11:20:05AM +0530, Aneesh Kumar K.V wrote:
> j.glisse@gmail.com writes:

Noted your grammar fixes.

> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 52ffb86..189e48f 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -653,3 +653,18 @@ config DEFERRED_STRUCT_PAGE_INIT
> >  	  when kswapd starts. This has a potential performance impact on
> >  	  processes running early in the lifetime of the systemm until kswapd
> >  	  finishes the initialisation.
> > +
> > +if STAGING
> > +config HMM
> > +	bool "Enable heterogeneous memory management (HMM)"
> > +	depends on MMU
> > +	select MMU_NOTIFIER
> > +	select GENERIC_PAGE_TABLE
> 
> What is GENERIC_PAGE_TABLE ?

Leftover from when patch 0006 was a separate feature that was introduced
before this patch. I failed to remove that chunk. Just ignore it.

Cheers,
Jérôme
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Hairgrove June 8, 2015, 7:40 p.m. UTC | #3
On Thu, 21 May 2015, j.glisse@gmail.com wrote:

> From: Jérôme Glisse <jglisse@redhat.com>
>
> This patch only introduce core HMM functions for registering a new
> mirror and stopping a mirror as well as HMM device registering and
> unregistering.
>
> [...]
>
> +/* struct hmm_device_operations - HMM device operation callback
> + */
> +struct hmm_device_ops {
> +	/* release() - mirror must stop using the address space.
> +	 *
> +	 * @mirror: The mirror that link process address space with the device.
> +	 *
> +	 * When this is call, device driver must kill all device thread using
> +	 * this mirror. Also, this callback is the last thing call by HMM and
> +	 * HMM will not access the mirror struct after this call (ie no more
> +	 * dereference of it so it is safe for the device driver to free it).
> +	 * It is call either from :
> +	 *   - mm dying (all process using this mm exiting).
> +	 *   - hmm_mirror_unregister() (if no other thread holds a reference)
> +	 *   - outcome of some device error reported by any of the device
> +	 *     callback against that mirror.
> +	 */
> +	void (*release)(struct hmm_mirror *mirror);
> +};

The comment that ->release is called when the mm dies doesn't match the
implementation. ->release is only called when the mirror is destroyed, and
that can only happen after the mirror has been unregistered. This may not
happen until after the mm dies.

Is the intent for the driver to get the callback when the mm goes down?
That seems beneficial so the driver can kill whatever's happening on the
device. Otherwise the device may continue operating in a dead address
space until the driver's file gets closed and it unregisters the mirror.


> +static void hmm_mirror_destroy(struct kref *kref)
> +{
> +	struct hmm_device *device;
> +	struct hmm_mirror *mirror;
> +	struct hmm *hmm;
> +
> +	mirror = container_of(kref, struct hmm_mirror, kref);
> +	device = mirror->device;
> +	hmm = mirror->hmm;
> +
> +	mutex_lock(&device->mutex);
> +	list_del_init(&mirror->dlist);
> +	device->ops->release(mirror);
> +	mutex_unlock(&device->mutex);
> +}

The hmm variable is unused. It also probably isn't safe to access at this
point.


> +static void hmm_mirror_kill(struct hmm_mirror *mirror)
> +{
> +	down_write(&mirror->hmm->rwsem);
> +	if (!hlist_unhashed(&mirror->mlist)) {
> +		hlist_del_init(&mirror->mlist);
> +		up_write(&mirror->hmm->rwsem);
> +
> +		hmm_mirror_unref(&mirror);
> +	} else
> +		up_write(&mirror->hmm->rwsem);
> +}

Shouldn't this call hmm_unref? hmm_mirror_register calls hmm_ref but
there's no corresponding hmm_unref when the mirror goes away. As a result
the hmm struct gets leaked and thus so does the entire mm since
mmu_notifier_unregister is never called.

It might also be a good idea to set mirror->hmm = NULL here to prevent
accidental use in say hmm_mirror_destroy.


> +/* hmm_device_unregister() - unregister a device with HMM.
> + *
> + * @device: The hmm_device struct.
> + * Returns: 0 on success or -EBUSY otherwise.
> + *
> + * Call when device driver want to unregister itself with HMM. This will check
> + * that there is no any active mirror and returns -EBUSY if so.
> + */
> +int hmm_device_unregister(struct hmm_device *device)
> +{
> +	mutex_lock(&device->mutex);
> +	if (!list_empty(&device->mirrors)) {
> +		mutex_unlock(&device->mutex);
> +		return -EBUSY;
> +	}
> +	mutex_unlock(&device->mutex);
> +	return 0;
> +}

I assume that the intention is for the caller to spin on
hmm_device_unregister until -EBUSY is no longer returned?

If so, I think there's a race here in the case of mm teardown happening
concurrently with hmm_mirror_unregister. This can happen if the parent
process was forked and exits while the child closes the file, or if the
file is passed to another process and closed last there while the original
process exits.

The upshot is that the hmm_device may still be referenced by another
thread even after hmm_device_unregister returns 0.

The below sequence shows how this might happen. Coming into this, the
mirror's ref count is 2:

Thread A (file close)               Thread B (process exit)
----------------------              ----------------------
                                    hmm_notifier_release
                                      down_write(&hmm->rwsem);
hmm_mirror_unregister
  hmm_mirror_kill
    down_write(&hmm->rwsem);
    // Blocked on thread B
                                      hlist_del_init(&mirror->mlist);
                                      up_write(&hmm->rwsem);

                                      // Thread A unblocked
                                      // Thread B is preempted
    // hlist_unhashed returns 1
    up_write(&hmm->rwsem);

  // Mirror ref goes 2 -> 1
  hmm_mirror_unref(&mirror);

  // hmm_mirror_unregister returns

At this point hmm_mirror_unregister has returned to the caller but the
mirror still is in use by thread B. Since all mirrors have been
unregistered, the driver in thread A is now free to call
hmm_device_unregister.

                                      // Thread B is scheduled

                                      // Mirror ref goes 1 -> 0
                                      hmm_mirror_unref(&mirror);
                                        hmm_mirror_destroy(&mirror)
                                          mutex_lock(&device->mutex);
                                          list_del_init(&mirror->dlist);
                                          device->ops->release(mirror);
                                          mutex_unlock(&device->mutex);

hmm_device_unregister
  mutex_lock(&device->mutex);
  // Device list empty
  mutex_unlock(&device->mutex);
  return 0;
// Caller frees device

Do you agree that this sequence can happen, or am I missing something
which prevents it?

If this can happen, the problem is that the only thing preventing thread A
from freeing the device is that thread B has device->mutex locked. That's
bad, because a lock within a structure cannot be used to control freeing
that structure. The mutex_unlock in thread B may internally still access
the mutex memory even after the atomic operation which unlocks the mutex
and unblocks thread A.

This can't be solved by having the driver wait for the ->release mirror
callback before it calls hmm_device_unregister, because the race happens
after that point.

A kref on the device itself might solve this, but the core issue IMO is
that hmm_mirror_unregister doesn't wait for hmm_notifier_release to
complete before returning. It feels like hmm_mirror_unregister wants to do
a synchronize_srcu on the mmu_notifier srcu. Is that possible?

Whatever the resolution, it would be useful for the block comments of
hmm_mirror_unregister and hmm_device_unregister to describe the
expectations on the caller and what the caller is guaranteed as far as
mirror and device lifetimes go.

Thanks,
Mark
Jerome Glisse June 8, 2015, 9:17 p.m. UTC | #4
On Mon, Jun 08, 2015 at 12:40:18PM -0700, Mark Hairgrove wrote:
> 
> 
> On Thu, 21 May 2015, j.glisse@gmail.com wrote:
> 
> > From: Jérôme Glisse <jglisse@redhat.com>
> >
> > This patch only introduce core HMM functions for registering a new
> > mirror and stopping a mirror as well as HMM device registering and
> > unregistering.
> >
> > [...]
> >
> > +/* struct hmm_device_operations - HMM device operation callback
> > + */
> > +struct hmm_device_ops {
> > +	/* release() - mirror must stop using the address space.
> > +	 *
> > +	 * @mirror: The mirror that link process address space with the device.
> > +	 *
> > +	 * When this is call, device driver must kill all device thread using
> > +	 * this mirror. Also, this callback is the last thing call by HMM and
> > +	 * HMM will not access the mirror struct after this call (ie no more
> > +	 * dereference of it so it is safe for the device driver to free it).
> > +	 * It is call either from :
> > +	 *   - mm dying (all process using this mm exiting).
> > +	 *   - hmm_mirror_unregister() (if no other thread holds a reference)
> > +	 *   - outcome of some device error reported by any of the device
> > +	 *     callback against that mirror.
> > +	 */
> > +	void (*release)(struct hmm_mirror *mirror);
> > +};
> 
> The comment that ->release is called when the mm dies doesn't match the
> implementation. ->release is only called when the mirror is destroyed, and
> that can only happen after the mirror has been unregistered. This may not
> happen until after the mm dies.
> 
> Is the intent for the driver to get the callback when the mm goes down?
> That seems beneficial so the driver can kill whatever's happening on the
> device. Otherwise the device may continue operating in a dead address
> space until the driver's file gets closed and it unregisters the mirror.

This was the intent before merging free & release. I guess I need to
reinstate the free versus release callback. Sadly the lifetime for HMM
is more complex than mmu_notifier's, as we intend the mirror struct to
be embedded into a driver private struct.

> 
> > +static void hmm_mirror_destroy(struct kref *kref)
> > +{
> > +	struct hmm_device *device;
> > +	struct hmm_mirror *mirror;
> > +	struct hmm *hmm;
> > +
> > +	mirror = container_of(kref, struct hmm_mirror, kref);
> > +	device = mirror->device;
> > +	hmm = mirror->hmm;
> > +
> > +	mutex_lock(&device->mutex);
> > +	list_del_init(&mirror->dlist);
> > +	device->ops->release(mirror);
> > +	mutex_unlock(&device->mutex);
> > +}
> 
> The hmm variable is unused. It also probably isn't safe to access at this
> point.

An hmm_unref(hmm); call was lost somewhere, probably in a rebase, and it
is safe to access hmm here.

> 
> 
> > +static void hmm_mirror_kill(struct hmm_mirror *mirror)
> > +{
> > +	down_write(&mirror->hmm->rwsem);
> > +	if (!hlist_unhashed(&mirror->mlist)) {
> > +		hlist_del_init(&mirror->mlist);
> > +		up_write(&mirror->hmm->rwsem);
> > +
> > +		hmm_mirror_unref(&mirror);
> > +	} else
> > +		up_write(&mirror->hmm->rwsem);
> > +}
> 
> Shouldn't this call hmm_unref? hmm_mirror_register calls hmm_ref but
> there's no corresponding hmm_unref when the mirror goes away. As a result
> the hmm struct gets leaked and thus so does the entire mm since
> mmu_notifier_unregister is never called.
> 
> It might also be a good idea to set mirror->hmm = NULL here to prevent
> accidental use in say hmm_mirror_destroy.

No, hmm_mirror_destroy() must be the one doing the hmm_unref(hmm).

> 
> 
> > +/* hmm_device_unregister() - unregister a device with HMM.
> > + *
> > + * @device: The hmm_device struct.
> > + * Returns: 0 on success or -EBUSY otherwise.
> > + *
> > + * Call when device driver want to unregister itself with HMM. This will check
> > + * that there is no any active mirror and returns -EBUSY if so.
> > + */
> > +int hmm_device_unregister(struct hmm_device *device)
> > +{
> > +	mutex_lock(&device->mutex);
> > +	if (!list_empty(&device->mirrors)) {
> > +		mutex_unlock(&device->mutex);
> > +		return -EBUSY;
> > +	}
> > +	mutex_unlock(&device->mutex);
> > +	return 0;
> > +}
> 
> I assume that the intention is for the caller to spin on
> hmm_device_unregister until -EBUSY is no longer returned?
> 
> If so, I think there's a race here in the case of mm teardown happening
> concurrently with hmm_mirror_unregister. This can happen if the parent
> process was forked and exits while the child closes the file, or if the
> file is passed to another process and closed last there while the original
> process exits.
> 
> The upshot is that the hmm_device may still be referenced by another
> thread even after hmm_device_unregister returns 0.
> 
> The below sequence shows how this might happen. Coming into this, the
> mirror's ref count is 2:
> 
> Thread A (file close)               Thread B (process exit)
> ----------------------              ----------------------
>                                     hmm_notifier_release
>                                       down_write(&hmm->rwsem);
> hmm_mirror_unregister
>   hmm_mirror_kill
>     down_write(&hmm->rwsem);
>     // Blocked on thread B
>                                       hlist_del_init(&mirror->mlist);
>                                       up_write(&hmm->rwsem);
> 
>                                       // Thread A unblocked
>                                       // Thread B is preempted
>     // hlist_unhashed returns 1
>     up_write(&hmm->rwsem);
> 
>   // Mirror ref goes 2 -> 1
>   hmm_mirror_unref(&mirror);
> 
>   // hmm_mirror_unregister returns
> 
> At this point hmm_mirror_unregister has returned to the caller but the
> mirror still is in use by thread B. Since all mirrors have been
> unregistered, the driver in thread A is now free to call
> hmm_device_unregister.
> 
>                                       // Thread B is scheduled
> 
>                                       // Mirror ref goes 1 -> 0
>                                       hmm_mirror_unref(&mirror);
>                                         hmm_mirror_destroy(&mirror)
>                                           mutex_lock(&device->mutex);
>                                           list_del_init(&mirror->dlist);
>                                           device->ops->release(mirror);
>                                           mutex_unlock(&device->mutex);
> 
> hmm_device_unregister
>   mutex_lock(&device->mutex);
>   // Device list empty
>   mutex_unlock(&device->mutex);
>   return 0;
> // Caller frees device
> 
> Do you agree that this sequence can happen, or am I missing something
> which prevents it?

Can't happen because the child has mm->hmm = NULL, i.e. there is only one
hmm per mm and an hmm is tied to only one mm. It is the responsibility of
the device driver to make sure the same applies to private references to
the hmm mirror struct, i.e. an hmm_mirror should never be tied to a
private file struct.

> 
> If this can happen, the problem is that the only thing preventing thread A
> from freeing the device is that thread B has device->mutex locked. That's
> bad, because a lock within a structure cannot be used to control freeing
> that structure. The mutex_unlock in thread B may internally still access
> the mutex memory even after the atomic operation which unlocks the mutex
> and unblocks thread A.
> 
> This can't be solved by having the driver wait for the ->release mirror
> callback before it calls hmm_device_unregister, because the race happens
> after that point.
> 
> A kref on the device itself might solve this, but the core issue IMO is
> that hmm_mirror_unregister doesn't wait for hmm_notifier_release to
> complete before returning. It feels like hmm_mirror_unregister wants to do
> a synchronize_srcu on the mmu_notifier srcu. Is that possible?

I guess I need to revisit the whole lifetime issue once again.

> 
> Whatever the resolution, it would be useful for the block comments of
> hmm_mirror_unregister and hmm_device_unregister to describe the
> expectations on the caller and what the caller is guaranteed as far as
> mirror and device lifetimes go.

Yes, I need to fix the comments and again spend time on lifetime. I
obviously completely screwed that up in that version of the patchset.

Cheers,
Jérôme
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Hairgrove June 9, 2015, 1:54 a.m. UTC | #5
On Mon, 8 Jun 2015, Jerome Glisse wrote:

> On Mon, Jun 08, 2015 at 12:40:18PM -0700, Mark Hairgrove wrote:
> >
> >
> > On Thu, 21 May 2015, j.glisse@gmail.com wrote:
> >
> > > From: Jérôme Glisse <jglisse@redhat.com>
> > >
> > > This patch only introduce core HMM functions for registering a new
> > > mirror and stopping a mirror as well as HMM device registering and
> > > unregistering.
> > >
> > > [...]
> > >
> > > +/* struct hmm_device_operations - HMM device operation callback
> > > + */
> > > +struct hmm_device_ops {
> > > +	/* release() - mirror must stop using the address space.
> > > +	 *
> > > +	 * @mirror: The mirror that link process address space with the device.
> > > +	 *
> > > +	 * When this is call, device driver must kill all device thread using
> > > +	 * this mirror. Also, this callback is the last thing call by HMM and
> > > +	 * HMM will not access the mirror struct after this call (ie no more
> > > +	 * dereference of it so it is safe for the device driver to free it).
> > > +	 * It is call either from :
> > > +	 *   - mm dying (all process using this mm exiting).
> > > +	 *   - hmm_mirror_unregister() (if no other thread holds a reference)
> > > +	 *   - outcome of some device error reported by any of the device
> > > +	 *     callback against that mirror.
> > > +	 */
> > > +	void (*release)(struct hmm_mirror *mirror);
> > > +};
> >
> > The comment that ->release is called when the mm dies doesn't match the
> > implementation. ->release is only called when the mirror is destroyed, and
> > that can only happen after the mirror has been unregistered. This may not
> > happen until after the mm dies.
> >
> > Is the intent for the driver to get the callback when the mm goes down?
> > That seems beneficial so the driver can kill whatever's happening on the
> > device. Otherwise the device may continue operating in a dead address
> > space until the driver's file gets closed and it unregisters the mirror.
>
> This was the intent before merging free & release. I guess i need to
> reinstate the free versus release callback. Sadly the lifetime for HMM
> is more complex than mmu_notifier as we intend the mirror struct to
> be embedded into a driver private struct.

Can you clarify how that's different from mmu_notifiers? Those are also
embedded into a driver-owned struct.

Is the goal to allow calling hmm_mirror_unregister from within the "mm is
dying" HMM callback? I don't know whether that's really necessary as long
as there's some association between the driver files and the mirrors.


> > If so, I think there's a race here in the case of mm teardown happening
> > concurrently with hmm_mirror_unregister.
> >
> > [...]
> >
> > Do you agree that this sequence can happen, or am I missing something
> > which prevents it?
>
> Can't happen because child have mm->hmm = NULL ie only one hmm per mm
> and hmm is tie to only one mm. It is the responsability of the device
> driver to make sure same apply to private reference to the hmm mirror
> struct ie hmm_mirror should never be tie to a private file struct.

It's useful for the driver to have some association between files and
mirrors. If the file is closed prior to process exit we would like to
unregister the mirror, otherwise it will persist until process teardown.
The association doesn't have to be 1:1 but having the files ref count the
mirror or something would be useful.

But even if we assume no association at all between files and mirrors, are
you sure that prevents the race? The driver may choose to unregister the
hmm_device at any point once its files are closed. In the case of module
unload the device unregister can't be prevented. If mm teardown hasn't
happened yet mirrors may still be active and registered on that
hmm_device. The driver thus has to first call hmm_mirror_unregister on all
active mirrors, then call hmm_device_unregister. mm teardown of those
mirrors may trigger at any point in this sequence, so we're right back to
that race.
Jerome Glisse June 9, 2015, 3:56 p.m. UTC | #6
On Mon, Jun 08, 2015 at 06:54:29PM -0700, Mark Hairgrove wrote:
> On Mon, 8 Jun 2015, Jerome Glisse wrote:
> > On Mon, Jun 08, 2015 at 12:40:18PM -0700, Mark Hairgrove wrote:
> > > On Thu, 21 May 2015, j.glisse@gmail.com wrote:
> > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > >
> > > > This patch only introduce core HMM functions for registering a new
> > > > mirror and stopping a mirror as well as HMM device registering and
> > > > unregistering.
> > > >
> > > > [...]
> > > >
> > > > +/* struct hmm_device_operations - HMM device operation callback
> > > > + */
> > > > +struct hmm_device_ops {
> > > > +	/* release() - mirror must stop using the address space.
> > > > +	 *
> > > > +	 * @mirror: The mirror that link process address space with the device.
> > > > +	 *
> > > > +	 * When this is call, device driver must kill all device thread using
> > > > +	 * this mirror. Also, this callback is the last thing call by HMM and
> > > > +	 * HMM will not access the mirror struct after this call (ie no more
> > > > +	 * dereference of it so it is safe for the device driver to free it).
> > > > +	 * It is call either from :
> > > > +	 *   - mm dying (all process using this mm exiting).
> > > > +	 *   - hmm_mirror_unregister() (if no other thread holds a reference)
> > > > +	 *   - outcome of some device error reported by any of the device
> > > > +	 *     callback against that mirror.
> > > > +	 */
> > > > +	void (*release)(struct hmm_mirror *mirror);
> > > > +};
> > >
> > > The comment that ->release is called when the mm dies doesn't match the
> > > implementation. ->release is only called when the mirror is destroyed, and
> > > that can only happen after the mirror has been unregistered. This may not
> > > happen until after the mm dies.
> > >
> > > Is the intent for the driver to get the callback when the mm goes down?
> > > That seems beneficial so the driver can kill whatever's happening on the
> > > device. Otherwise the device may continue operating in a dead address
> > > space until the driver's file gets closed and it unregisters the mirror.
> >
> > This was the intent before merging free & release. I guess i need to
> > reinstate the free versus release callback. Sadly the lifetime for HMM
> > is more complex than mmu_notifier as we intend the mirror struct to
> > be embedded into a driver private struct.
> 
> Can you clarify how that's different from mmu_notifiers? Those are also
> embedded into a driver-owned struct.

For HMM you want to be able to kill a mirror from HMM; you might have a
kernel thread call inside HMM with a mirror (outside any device file
lifetime) ... The mirror is not only used at register & unregister; there
are a lot more things you can call using the HMM mirror struct.

So the HMM mirror lifetime as a result is more complex; it can not simply
be freed from the mmu_notifier_release callback or randomly. It needs to
be refcounted. The mmu_notifier code assumes that the mmu_notifier struct
is embedded inside a struct that has a lifetime properly synchronized with
the mm. For the HMM mirror this is not something that sounds like a good
idea, as there are too many ways to get it wrong.

So the idea of the HMM mirror is that it can outlast the mm lifetime, but
the HMM struct can not. So you have hmm_mirror <~> hmm <-> mm, and the
mirror can be "unlinked" and have a different lifetime from the hmm, which
itself has the same lifetime as the mm.

> Is the goal to allow calling hmm_mirror_unregister from within the "mm is
> dying" HMM callback? I don't know whether that's really necessary as long
> as there's some association between the driver files and the mirrors.

No, this is not a goal, and I actually forbid that.

> 
> > > If so, I think there's a race here in the case of mm teardown happening
> > > concurrently with hmm_mirror_unregister.
> > >
> > > [...]
> > >
> > > Do you agree that this sequence can happen, or am I missing something
> > > which prevents it?
> >
> > Can't happen because child have mm->hmm = NULL ie only one hmm per mm
> > and hmm is tie to only one mm. It is the responsability of the device
> > driver to make sure same apply to private reference to the hmm mirror
> > struct ie hmm_mirror should never be tie to a private file struct.
> 
> It's useful for the driver to have some association between files and
> mirrors. If the file is closed prior to process exit we would like to
> unregister the mirror, otherwise it will persist until process teardown.
> The association doesn't have to be 1:1 but having the files ref count the
> mirror or something would be useful.

This is allowed; I might have put it in strong words here, but you can associate
a mirror with a file struct. What you cannot do is use the mirror from a
different process, i.e. one with a different mm struct, as a mirror is linked to
a single mm. So on fork there is no callback to update the private file struct
when the device file is duplicated (well, its refcount is just incremented)
against a different process. This is something you need to be careful about in
your driver. Inside the dummy driver I abuse that to actually test the proper
behavior of HMM, but it should not be used as an example.

> 
> But even if we assume no association at all between files and mirrors, are
> you sure that prevents the race? The driver may choose to unregister the
> hmm_device at any point once its files are closed. In the case of module
> unload the device unregister can't be prevented. If mm teardown hasn't
> happened yet mirrors may still be active and registered on that
> hmm_device. The driver thus has to first call hmm_mirror_unregister on all
> active mirrors, then call hmm_device_unregister. mm teardown of those
> mirrors may trigger at any point in this sequence, so we're right back to
> that race.

So when a device driver unloads, the first thing it needs to do is kill all of
its contexts, i.e. all of its HMM mirrors (unregister them). By doing so it
makes sure that there can be no more calls to any of its functions.

The race with mm teardown does not exist, as what matters for mm teardown is
whether the mirror is on the struct hmm mirrors list or not. Either the device
driver is first to remove the mirror from the list, or the mm teardown is; this
is lock protected, so only one thread can do it.

The issue you pointed out is really about decoupling the lifetime of the mirror
context (i.e. the hardware thread that uses the mirror) from the lifetime of the
structure that embeds the hmm_mirror struct. The device driver cares about the
second, while everything else only really cares about the first. The second
tells you when you know for sure that there will be no more callbacks into your
device driver code. The first only tells you that there should be no more
activity associated with that mirror, but some thread might still hold a
reference on the underlying struct.


Hope this clarify design and motivation behind the hmm_mirror vs hmm struct
lifetime.


Cheers,
Jérôme
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Hairgrove June 10, 2015, 3:33 a.m. UTC | #7
On Tue, 9 Jun 2015, Jerome Glisse wrote:

> On Mon, Jun 08, 2015 at 06:54:29PM -0700, Mark Hairgrove wrote:
> > Can you clarify how that's different from mmu_notifiers? Those are also
> > embedded into a driver-owned struct.
> 
> For HMM you want to be able to kill a mirror from HMM, you might have kernel
> thread call inside HMM with a mirror (outside any device file lifetime) ...
> The mirror is not only use at register & unregister, there is a lot more thing
> you can call using the HMM mirror struct.
> 
> So the HMM mirror lifetime as a result is more complex, it can not simply be
> free from the mmu_notifier_release callback or randomly. It needs to be
> refcounted.

Sure, there are driver -> HMM calls like hmm_mirror_fault that 
mmu_notifiers don't have, but I don't understand why that fundamentally 
makes HMM mirror lifetimes more complex. Decoupling hmm_mirror lifetime 
from mm lifetime adds complexity too.

> The mmu_notifier code assume that the mmu_notifier struct is
> embedded inside a struct that has a lifetime properly synchronize with the
> mm. For HMM mirror this is not something that sounds like a good idea as there
> is too many way to get it wrong.

What kind of synchronization with the mm are you referring to here? 
Clients of mmu_notifiers don't have to do anything as far as I know. 
They're guaranteed that the mm won't go away because each registered 
notifier bumps mm_count.

> So idea of HMM mirror is that it can out last the mm lifetime but the HMM
> struct can not. So you have hmm_mirror <~> hmm <-> mm and the mirror can be
> "unlink" and have different lifetime from the hmm that itself has same life
> time as mm.

Per the earlier discussion hmm_mirror_destroy is missing a call to 
hmm_unref. If that's added back I don't understand how the mirror can 
persist past the hmm struct. The mirror can be unlinked from hmm's list, 
yes, but that doesn't mean that hmm/mm can be torn down. The hmm/mm 
structs will stick around until hmm_destroy since that does the 
mmu_notifier_unregister. hmm_destroy can't be called until the last 
hmm_mirror_destroy.

Doesn't that mean that hmm/mm are guaranteed to be allocated until the 
last hmm_mirror_unregister? That sounds like a good guarantee to make.


> 
> > Is the goal to allow calling hmm_mirror_unregister from within the "mm is
> > dying" HMM callback? I don't know whether that's really necessary as long
> > as there's some association between the driver files and the mirrors.
> 
> No this is not a goal and i actualy forbid that.
> 
> > 
> > > > If so, I think there's a race here in the case of mm teardown happening
> > > > concurrently with hmm_mirror_unregister.
> > > >
> > > > [...]
> > > >
> > > > Do you agree that this sequence can happen, or am I missing something
> > > > which prevents it?
> > >
> > > Can't happen because child have mm->hmm = NULL ie only one hmm per mm
> > > and hmm is tie to only one mm. It is the responsability of the device
> > > driver to make sure same apply to private reference to the hmm mirror
> > > struct ie hmm_mirror should never be tie to a private file struct.
> > 
> > It's useful for the driver to have some association between files and
> > mirrors. If the file is closed prior to process exit we would like to
> > unregister the mirror, otherwise it will persist until process teardown.
> > The association doesn't have to be 1:1 but having the files ref count the
> > mirror or something would be useful.
> 
> This is allowed, i might have put strong word here, but you can associate
> with a file struct. What you can not do is use the mirror from a different
> process ie one with a different mm struct as mirror is linked to a single
> mm. So on fork there is no callback to update the private file struct, when
> the device file is duplicated (well just refcount inc) against a different
> process. This is something you need to be carefull in your driver. Inside
> the dummy driver i abuse that to actually test proper behavior of HMM but
> it should not be use as an example.

So to confirm, on all file operations from user space the driver is 
expected to check that current->mm matches the mm associated with the 
struct file's hmm_mirror?

On file->release the driver still ought to call hmm_mirror_unregister 
regardless of whether the mms match, otherwise we'll never tear down the 
mirror. That means we're not saved from the race condition because 
hmm_mirror_unregister can happen in one thread while hmm_notifier_release 
might be happening in another thread.


> > 
> > But even if we assume no association at all between files and mirrors, are
> > you sure that prevents the race? The driver may choose to unregister the
> > hmm_device at any point once its files are closed. In the case of module
> > unload the device unregister can't be prevented. If mm teardown hasn't
> > happened yet mirrors may still be active and registered on that
> > hmm_device. The driver thus has to first call hmm_mirror_unregister on all
> > active mirrors, then call hmm_device_unregister. mm teardown of those
> > mirrors may trigger at any point in this sequence, so we're right back to
> > that race.
> 
> So when device driver unload the first thing it needs to do is kill all of
> its context ie all of its HMM mirror (unregister them) by doing so it will
> make sure that there can be no more call to any of its functions.

When is the driver expected to call hmm_mirror_unregister? Is it file 
close, module unload, or some other time?

If it's file close, there's no need to unregister anything on module 
unload because the files were all closed already.

If it's module unload, then the mirrors and mms all get leaked until that 
point.

We're exposed to the race in both cases.

> 
> The race with mm teardown does not exist as what matter for mm teardown is
> the fact that the mirror is on the struct hmm mirrors list or not. Either
> the device driver is first to remove the mirror from the list or it is the
> mm teardown but this is lock protected so only one thread can do it.
> 

Agreed, removing the mirror from the list is not a "race" in the classical 
sense. The true race is between hmm_notifier_release's device mutex_unlock 
(process exit) and post-hmm_device_unregister device mutex free (driver 
close/unload). What I meant is that in order to expose that race you first 
need one thread to call hmm_mirror_unregister while another thread is in 
hmm_notifier_release.

Regardless of where hmm_mirror_unregister is called (file close, module 
unload, etc) it can happen concurrently with hmm_notifier_release so we're 
exposed to this race.


> The issue you pointed is really about decoupling the lifetime of the mirror
> context (ie hardware thread that use the mirror) and the lifetime of the
> structure that embedded the hmm_mirror struct. The device driver will care
> about the second while everything else will only really care about the
> first. The second tells you when you know for sure that there will be no
> more callback to your device driver code. The first only tells you that
> there should be no more activity associated with that mirror but some thread
> might still hold a reference on the underlying struct.
> 
> 
> Hope this clarify design and motivation behind the hmm_mirror vs hmm struct
> lifetime.
> 
> 
> Cheers,
> Jérôme
>
Jerome Glisse June 10, 2015, 3:42 p.m. UTC | #8
On Tue, Jun 09, 2015 at 08:33:12PM -0700, Mark Hairgrove wrote:
> 
> 
> On Tue, 9 Jun 2015, Jerome Glisse wrote:
> 
> > On Mon, Jun 08, 2015 at 06:54:29PM -0700, Mark Hairgrove wrote:
> > > Can you clarify how that's different from mmu_notifiers? Those are also
> > > embedded into a driver-owned struct.
> > 
> > For HMM you want to be able to kill a mirror from HMM, you might have kernel
> > thread call inside HMM with a mirror (outside any device file lifetime) ...
> > The mirror is not only use at register & unregister, there is a lot more thing
> > you can call using the HMM mirror struct.
> > 
> > So the HMM mirror lifetime as a result is more complex, it can not simply be
> > free from the mmu_notifier_release callback or randomly. It needs to be
> > refcounted.
> 
> Sure, there are driver -> HMM calls like hmm_mirror_fault that 
> mmu_notifiers don't have, but I don't understand why that fundamentally 
> makes HMM mirror lifetimes more complex. Decoupling hmm_mirror lifetime 
> from mm lifetime adds complexity too.

Driver->HMM calls can happen from a random kernel thread, thus you need to
guarantee that the hmm_mirror cannot go away. Moreover, you can have the CPU MM
call into HMM outside of mmu_notifier. Basically you can get to HMM code by
many different code paths, unlike any of the current mmu_notifier users.

So refcounting is necessary, as otherwise the device driver might decide to
unregister and free the mirror while some other kernel thread is about to
dereference the exact same mirror. Synchronization with the mmu notifier SRCU
will not be enough in the case of a page fault on remote memory, for instance.
There are other cases too.

> 
> > The mmu_notifier code assume that the mmu_notifier struct is
> > embedded inside a struct that has a lifetime properly synchronize with the
> > mm. For HMM mirror this is not something that sounds like a good idea as there
> > is too many way to get it wrong.
> 
> What kind of synchronization with the mm are you referring to here? 
> Clients of mmu_notifiers don't have to do anything as far as I know. 
> They're guaranteed that the mm won't go away because each registered 
> notifier bumps mm_count.

So all the current users, AFAICT, are tied to a file (kvm, xen, intel, radeon),
to a vma (sgi gru), or to an mm (iommu). Which means this is all properly
synchronized with the lifetime of the mm (ignoring the fork case).

The struct that is tied to the mmu_notifier can, for all of them, be accessed
only through one code path (an ioctl for most of them).

> 
> > So idea of HMM mirror is that it can out last the mm lifetime but the HMM
> > struct can not. So you have hmm_mirror <~> hmm <-> mm and the mirror can be
> > "unlink" and have different lifetime from the hmm that itself has same life
> > time as mm.
> 
> Per the earlier discussion hmm_mirror_destroy is missing a call to 
> hmm_unref. If that's added back I don't understand how the mirror can 
> persist past the hmm struct. The mirror can be unlinked from hmm's list, 
> yes, but that doesn't mean that hmm/mm can be torn down. The hmm/mm 
> structs will stick around until hmm_destroy since that does the 
> mmu_notifier_unregister. hmm_destroy can't be called until the last 
> hmm_mirror_destroy.
> 
> Doesn't that mean that hmm/mm are guaranteed to be allocated until the 
> last hmm_mirror_unregister? That sounds like a good guarantee to make.

Like I said, just ignore the current code; it is utterly broken in so many ways
when it comes to lifetime. I screwed that part up badly when reworking the
patchset; I was focusing on other parts.

I fixed that in my tree; I am waiting for more review on the other parts as, in
any case, the lifetime thing is easy to rework/fix.

http://cgit.freedesktop.org/~glisse/linux/log/?h=hmm

[...]
> > > > > If so, I think there's a race here in the case of mm teardown happening
> > > > > concurrently with hmm_mirror_unregister.
> > > > >
> > > > > [...]
> > > > >
> > > > > Do you agree that this sequence can happen, or am I missing something
> > > > > which prevents it?
> > > >
> > > > Can't happen because child have mm->hmm = NULL ie only one hmm per mm
> > > > and hmm is tie to only one mm. It is the responsability of the device
> > > > driver to make sure same apply to private reference to the hmm mirror
> > > > struct ie hmm_mirror should never be tie to a private file struct.
> > > 
> > > It's useful for the driver to have some association between files and
> > > mirrors. If the file is closed prior to process exit we would like to
> > > unregister the mirror, otherwise it will persist until process teardown.
> > > The association doesn't have to be 1:1 but having the files ref count the
> > > mirror or something would be useful.
> > 
> > This is allowed, i might have put strong word here, but you can associate
> > with a file struct. What you can not do is use the mirror from a different
> > process ie one with a different mm struct as mirror is linked to a single
> > mm. So on fork there is no callback to update the private file struct, when
> > the device file is duplicated (well just refcount inc) against a different
> > process. This is something you need to be carefull in your driver. Inside
> > the dummy driver i abuse that to actually test proper behavior of HMM but
> > it should not be use as an example.
> 
> So to confirm, on all file operations from user space the driver is 
> expected to check that current->mm matches the mm associated with the 
> struct file's hmm_mirror?

Well, you might have a valid use case for that; just be aware that anything
your driver does with the hmm_mirror will actually impact the mm of the
parent. Which I assume is not what you want.

I would actually have thought that what you want is a way to find the
hmm_mirror using both the device file & the mm as a key. Otherwise you cannot
really use HMM with processes that like to fork themselves, which is a valid
use case to me. For instance, a process starts using HMM through your driver,
then decides to fork itself and to also use HMM through your driver inside its
child.

> 
> On file->release the driver still ought to call hmm_mirror_unregister 
> regardless of whether the mms match, otherwise we'll never tear down the 
> mirror. That means we're not saved from the race condition because 
> hmm_mirror_unregister can happen in one thread while hmm_notifier_release 
> might be happening in another thread.

Again, there is no race: the mirror list is the synchronization point, and it
is protected by a lock. So either hmm_mirror_unregister() wins, or the other
thread in hmm_notifier_release() does.

> > > But even if we assume no association at all between files and mirrors, are
> > > you sure that prevents the race? The driver may choose to unregister the
> > > hmm_device at any point once its files are closed. In the case of module
> > > unload the device unregister can't be prevented. If mm teardown hasn't
> > > happened yet mirrors may still be active and registered on that
> > > hmm_device. The driver thus has to first call hmm_mirror_unregister on all
> > > active mirrors, then call hmm_device_unregister. mm teardown of those
> > > mirrors may trigger at any point in this sequence, so we're right back to
> > > that race.
> > 
> > So when device driver unload the first thing it needs to do is kill all of
> > its context ie all of its HMM mirror (unregister them) by doing so it will
> > make sure that there can be no more call to any of its functions.
> 
> When is the driver expected to call hmm_mirror_unregister? Is it file 
> close, module unload, or some other time?
> 
> If it's file close, there's no need to unregister anything on module 
> unload because the files were all closed already.
> 
> If it's module unload, then the mirrors and mms all get leaked until that 
> point.
> 
> We're exposed to the race in both cases.

You unregister as soon as you want; it is up to your driver to do it, and I do
not enforce anything. The only thing I enforce is that you cannot unregister
the hmm device driver before all mirrors are unregistered and freed.

So yes, for a device driver you want to unregister when the device file is
closed (which happens when the process exits).

In both cases there is no race, as explained above.

> 
> > 
> > The race with mm teardown does not exist as what matter for mm teardown is
> > the fact that the mirror is on the struct hmm mirrors list or not. Either
> > the device driver is first to remove the mirror from the list or it is the
> > mm teardown but this is lock protected so only one thread can do it.
> > 
> 
> Agreed, removing the mirror from the list is not a "race" in the classical 
> sense. The true race is between hmm_notifier_release's device mutex_unlock 
> (process exit) and post-hmm_device_unregister device mutex free (driver 
> close/unload). What I meant is that in order to expose that race you first 
> need one thread to call hmm_mirror_unregister while another thread is in 
> hmm_notifier_release.
> 
> Regardless of where hmm_mirror_unregister is called (file close, module 
> unload, etc) it can happen concurrently with hmm_notifier_release so we're 
> exposed to this race.

There is no race here; the mirror struct will only be freed once as, again, the
list is a synchronization point. Whoever removes the mirror from the list is
responsible for dropping the list reference.

In the fixed code the only thing that will happen twice is the ->release()
callback. Even that can be worked around to guarantee it is called only once.

Anyway, I do not see any race here.

Cheers,
Jérôme
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Hairgrove June 11, 2015, 1:15 a.m. UTC | #9
On Wed, 10 Jun 2015, Jerome Glisse wrote:

> [...]
> 
> Like said, just ignore current code it is utterly broken in so many way
> when it comes to lifetime. I screw that part badly when reworking the
> patchset, i was focusing on other part.
> 
> I fixed that in my tree, i am waiting for more review on other part as
> anyway the lifetime thing is easy to rework/fix.
> 
> http://cgit.freedesktop.org/~glisse/linux/log/?h=hmm
> 

Ok, I'm working through the other patches so I'll check the updates out 
once I've made it through. My primary interest in this discussion is 
making sure we know the plan for mirror and device lifetimes.


> > So to confirm, on all file operations from user space the driver is 
> > expected to check that current->mm matches the mm associated with the 
> > struct file's hmm_mirror?
> 
> Well you might have a valid usecase for that, just be aware that
> anything your driver do with the hmm_mirror will actually impact
> the mm of the parent. Which i assume is not what you want.
> 
> I would actualy thought that what you want is having a way to find
> hmm_mirror using both device file & mm as a key. Otherwise you can
> not really use HMM with process that like to fork themself. Which
> is a valid usecase to me. For instance process start using HMM
> through your driver, decide to fork itself and to also use HMM
> through your driver inside its child.

Agreed, that sounds reasonable, and the use case is valid. I was digging 
into this to make sure we don't prevent that.


> > 
> > On file->release the driver still ought to call hmm_mirror_unregister 
> > regardless of whether the mms match, otherwise we'll never tear down the 
> > mirror. That means we're not saved from the race condition because 
> > hmm_mirror_unregister can happen in one thread while hmm_notifier_release 
> > might be happening in another thread.
> 
> Again there is no race the mirror list is the synchronization point and
> it is protected by a lock. So either hmm_mirror_unregister() wins or the
> other thread hmm_notifier_release()

Yes, I agree. That's not the race I'm worried about. I'm worried about a 
race on the device lifetime, but in order to hit that one first 
hmm_notifier_release must take the lock and remove the mirror from the 
list before hmm_mirror_unregister does it. That's why I brought it up.


> 
> You unregister as soon as you want, it is up to your driver to do it,
> i do not enforce anything. The only thing i enforce is that you can
> not unregister the hmm device driver before all mirror are unregistered
> and free.
> 
> So yes for device driver you want to unregister when device file is
> close (which happens when the process exit).

Sounds good.


> 
> There is no race here, the mirror struct will only be freed once as again
> the list is a synchronization point. Whoever remove the mirror from the
> list is responsible to drop the list reference.
> 
> In the fixed code the only thing that will happen twice is the ->release()
> callback. Even that can be work around to garanty it is call only once.
> 
> Anyway i do not see anyrace here.
> 

The mirror lifetime is fine. The problem I see is with the device lifetime 
on a multi-core system. Imagine this sequence:

- On CPU1 the mm associated with the mirror is going down
- On CPU2 the driver unregisters the mirror then the device

When this happens, the last device mutex_unlock on CPU1 is the only thing 
preventing the free of the device in CPU2. That doesn't work, as described 
in this thread: https://lkml.org/lkml/2013/12/2/997

Here's the full sequence again with mutex_unlock split apart. Hopefully 
this shows the device_unregister problem more clearly:

CPU1 (mm release)                   CPU2 (driver)
----------------------              ----------------------
hmm_notifier_release
  down_write(&hmm->rwsem);
  hlist_del_init(&mirror->mlist);
  up_write(&hmm->rwsem);

  // CPU1 thread is preempted or 
  // something
                                    hmm_mirror_unregister
                                      hmm_mirror_kill
                                        down_write(&hmm->rwsem);
                                        // mirror removed by CPU1 already
                                        // so hlist_unhashed returns 1
                                        up_write(&hmm->rwsem);

                                      hmm_mirror_unref(&mirror);
                                      // Mirror ref now 1

                                      // CPU2 thread is preempted or
                                      // something
// CPU1 thread is scheduled

hmm_mirror_unref(&mirror);
  // Mirror ref now 0, cleanup
  hmm_mirror_destroy(&mirror)
    mutex_lock(&device->mutex);
    list_del_init(&mirror->dlist);
    device->ops->release(mirror);
      kfree(mirror);
                                      // CPU2 thread is scheduled, now
                                      // both CPU1 and CPU2 are running

                                    hmm_device_unregister
                                      mutex_lock(&device->mutex);
                                        mutex_optimistic_spin()
    mutex_unlock(&device->mutex);
      [...]
      __mutex_unlock_common_slowpath
        // CPU2 releases lock
        atomic_set(&lock->count, 1);
                                          // Spinning CPU2 acquires now-
                                          // free lock
                                      // mutex_lock returns
                                      // Device list empty
                                      mutex_unlock(&device->mutex);
                                      return 0;
                                    kfree(hmm_device);
        // CPU1 still accessing 
        // hmm_device->mutex in 
        //__mutex_unlock_common_slowpath
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jerome Glisse June 11, 2015, 2:23 p.m. UTC | #10
On Wed, Jun 10, 2015 at 06:15:08PM -0700, Mark Hairgrove wrote:

[...]
> > There is no race here, the mirror struct will only be freed once as again
> > the list is a synchronization point. Whoever remove the mirror from the
> > list is responsible to drop the list reference.
> > 
> > In the fixed code the only thing that will happen twice is the ->release()
> > callback. Even that can be work around to garanty it is call only once.
> > 
> > Anyway i do not see anyrace here.
> > 
> 
> The mirror lifetime is fine. The problem I see is with the device lifetime 
> on a multi-core system. Imagine this sequence:
> 
> - On CPU1 the mm associated with the mirror is going down
> - On CPU2 the driver unregisters the mirror then the device
> 
> When this happens, the last device mutex_unlock on CPU1 is the only thing 
> preventing the free of the device in CPU2. That doesn't work, as described 
> in this thread: https://lkml.org/lkml/2013/12/2/997
> 
> Here's the full sequence again with mutex_unlock split apart. Hopefully 
> this shows the device_unregister problem more clearly:
> 
> CPU1 (mm release)                   CPU2 (driver)
> ----------------------              ----------------------
> hmm_notifier_release
>   down_write(&hmm->rwsem);
>   hlist_del_init(&mirror->mlist);
>   up_write(&hmm->rwsem);
> 
>   // CPU1 thread is preempted or 
>   // something
>                                     hmm_mirror_unregister
>                                       hmm_mirror_kill
>                                         down_write(&hmm->rwsem);
>                                         // mirror removed by CPU1 already
>                                         // so hlist_unhashed returns 1
>                                         up_write(&hmm->rwsem);
> 
>                                       hmm_mirror_unref(&mirror);
>                                       // Mirror ref now 1
> 
>                                       // CPU2 thread is preempted or
>                                       // something
> // CPU1 thread is scheduled
> 
> hmm_mirror_unref(&mirror);
>   // Mirror ref now 0, cleanup
>   hmm_mirror_destroy(&mirror)
>     mutex_lock(&device->mutex);
>     list_del_init(&mirror->dlist);
>     device->ops->release(mirror);
>       kfree(mirror);
>                                       // CPU2 thread is scheduled, now
>                                       // both CPU1 and CPU2 are running
> 
>                                     hmm_device_unregister
>                                       mutex_lock(&device->mutex);
>                                         mutex_optimistic_spin()
>     mutex_unlock(&device->mutex);
>       [...]
>       __mutex_unlock_common_slowpath
>         // CPU2 releases lock
>         atomic_set(&lock->count, 1);
>                                           // Spinning CPU2 acquires now-
>                                           // free lock
>                                       // mutex_lock returns
>                                       // Device list empty
>                                       mutex_unlock(&device->mutex);
>                                       return 0;
>                                     kfree(hmm_device);
>         // CPU1 still accessing 
>         // hmm_device->mutex in 
>         //__mutex_unlock_common_slowpath

Ok, I see the race you are afraid of, and really it is an unlikely one.
__mutex_unlock_common_slowpath() takes a spinlock right after allowing
others to take the mutex. When we are in your scenario there is no
contention on that spinlock, so it is taken right away, and as there
is no one in the mutex wait list it goes directly to unlock the
spinlock and return. You can ignore the debug function: if debugging
is enabled then mutex_lock() would also need to take the spinlock,
and thus you would have proper synchronization between the two threads
thanks to the mutex.wait_lock.

So basically, while CPU1 is doing:
spin_lock(mutex.wait_lock)
if (!list_empty(mutex.wait_list)) {
  // wait_list is empty so branch not taken
}
spin_unlock(mutex.wait_lock)

CPU2 would have to test the mirror list, mutex_unlock(), and return
before the spin_unlock() of CPU1. This is a tight race; I can add a
synchronize_rcu() to device_unregister after the mutex_unlock() so
that we also add a grace period before the device is potentially freed,
which should make that race completely unlikely.

Moreover, for something really bad to happen, the freed memory would
need to be reallocated right away by some other thread, which really
sounds unlikely unless CPU1 is the slowest of all :)

Cheers,
Jérôme
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Hairgrove June 11, 2015, 10:26 p.m. UTC | #11
On Thu, 11 Jun 2015, Jerome Glisse wrote:

> On Wed, Jun 10, 2015 at 06:15:08PM -0700, Mark Hairgrove wrote:
> 
> [...]
> > > There is no race here, the mirror struct will only be freed once as again
> > > the list is a synchronization point. Whoever remove the mirror from the
> > > list is responsible to drop the list reference.
> > > 
> > > In the fixed code the only thing that will happen twice is the ->release()
> > > callback. Even that can be work around to garanty it is call only once.
> > > 
> > > Anyway i do not see anyrace here.
> > > 
> > 
> > The mirror lifetime is fine. The problem I see is with the device lifetime 
> > on a multi-core system. Imagine this sequence:
> > 
> > - On CPU1 the mm associated with the mirror is going down
> > - On CPU2 the driver unregisters the mirror then the device
> > 
> > When this happens, the last device mutex_unlock on CPU1 is the only thing 
> > preventing the free of the device in CPU2. That doesn't work, as described 
> > in this thread: https://lkml.org/lkml/2013/12/2/997
> > 
> > Here's the full sequence again with mutex_unlock split apart. Hopefully 
> > this shows the device_unregister problem more clearly:
> > 
> > CPU1 (mm release)                   CPU2 (driver)
> > ----------------------              ----------------------
> > hmm_notifier_release
> >   down_write(&hmm->rwsem);
> >   hlist_del_init(&mirror->mlist);
> >   up_write(&hmm->rwsem);
> > 
> >   // CPU1 thread is preempted or 
> >   // something
> >                                     hmm_mirror_unregister
> >                                       hmm_mirror_kill
> >                                         down_write(&hmm->rwsem);
> >                                         // mirror removed by CPU1 already
> >                                         // so hlist_unhashed returns 1
> >                                         up_write(&hmm->rwsem);
> > 
> >                                       hmm_mirror_unref(&mirror);
> >                                       // Mirror ref now 1
> > 
> >                                       // CPU2 thread is preempted or
> >                                       // something
> > // CPU1 thread is scheduled
> > 
> > hmm_mirror_unref(&mirror);
> >   // Mirror ref now 0, cleanup
> >   hmm_mirror_destroy(&mirror)
> >     mutex_lock(&device->mutex);
> >     list_del_init(&mirror->dlist);
> >     device->ops->release(mirror);
> >       kfree(mirror);
> >                                       // CPU2 thread is scheduled, now
> >                                       // both CPU1 and CPU2 are running
> > 
> >                                     hmm_device_unregister
> >                                       mutex_lock(&device->mutex);
> >                                         mutex_optimistic_spin()
> >     mutex_unlock(&device->mutex);
> >       [...]
> >       __mutex_unlock_common_slowpath
> >         // CPU2 releases lock
> >         atomic_set(&lock->count, 1);
> >                                           // Spinning CPU2 acquires now-
> >                                           // free lock
> >                                       // mutex_lock returns
> >                                       // Device list empty
> >                                       mutex_unlock(&device->mutex);
> >                                       return 0;
> >                                     kfree(hmm_device);
> >         // CPU1 still accessing 
> >         // hmm_device->mutex in 
> >         //__mutex_unlock_common_slowpath
> 
> Ok i see the race you are afraid of and really it is an unlikely one
> __mutex_unlock_common_slowpath() take a spinlock right after allowing
> other to take the mutex, when we are in your scenario there is no
> contention on that spinlock so it is taken right away and as there
> is no one in the mutex wait list then it goes directly to unlock the
> spinlock and return. You can ignore the debug function as if debugging
> is enabled than the mutex_lock() would need to also take the spinlock
> and thus you would have proper synchronization btw 2 thread thanks to
> the mutex.wait_lock.
> 
> So basicly while CPU1 is going :
> spin_lock(mutex.wait_lock)
> if (!list_empty(mutex.wait_list)) {
>   // wait_list is empty so branch not taken
> }
> spin_unlock(mutex.wait_lock)
> 
> CPU2 would have to test the mirror list and mutex_unlock and return
> before the spin_unlock() of CPU1. This is a tight race, i can add a
> synchronize_rcu() to device_unregister after the mutex_unlock() so
> that we also add a grace period before the device is potentialy freed
> which should make that race completely unlikely.
> 
> Moreover for something really bad to happen it would need that the
> freed memory to be reallocated right away by some other thread. Which
> really sound unlikely unless CPU1 is the slowest of all :)
> 
> Cheers,
> Jérôme
> 

But CPU1 could get preempted between the atomic_set and the 
spin_lock_mutex, and then it doesn't matter whether or not a grace period 
has elapsed before CPU2 proceeds.

Making race conditions less likely just makes them harder to pinpoint when 
they inevitably appear in the wild. I don't think it makes sense to spend 
any effort in making a race condition less likely, and that thread I 
referenced (https://lkml.org/lkml/2013/12/2/997) is fairly strong evidence 
that fixing this race actually matters. So, I think this race condition 
really needs to be fixed.

One fix is for hmm_mirror_unregister to wait for hmm_notifier_release 
completion between hmm_mirror_kill and hmm_mirror_unref. It can do this by 
calling synchronize_srcu() on the mmu_notifier's srcu. This has the 
benefit that the driver is guaranteed not to get the "mm is dead" callback 
after hmm_mirror_unregister returns.

In fact, are there any callbacks on the mirror that can arrive after 
hmm_mirror_unregister? If so, how will hmm_device_unregister solve them?

From a general standpoint, hmm_device_unregister must perform some kind of 
synchronization to be sure that all mirrors are completely released and 
done and no new callbacks will trigger. Since that has to be true, can't 
that synchronization be moved into hmm_mirror_unregister instead?

If that happens there's no need for a "mirror can be freed" ->release 
callback at all because the driver is guaranteed that a mirror is done 
after hmm_mirror_unregister.
Jerome Glisse June 15, 2015, 2:32 p.m. UTC | #12
On Thu, Jun 11, 2015 at 03:26:46PM -0700, Mark Hairgrove wrote:
> On Thu, 11 Jun 2015, Jerome Glisse wrote:
> > On Wed, Jun 10, 2015 at 06:15:08PM -0700, Mark Hairgrove wrote:

[...]
> > Ok i see the race you are afraid of and really it is an unlikely one
> > __mutex_unlock_common_slowpath() take a spinlock right after allowing
> > other to take the mutex, when we are in your scenario there is no
> > contention on that spinlock so it is taken right away and as there
> > is no one in the mutex wait list then it goes directly to unlock the
> > spinlock and return. You can ignore the debug function as if debugging
> > is enabled than the mutex_lock() would need to also take the spinlock
> > and thus you would have proper synchronization btw 2 thread thanks to
> > the mutex.wait_lock.
> > 
> > So basicly while CPU1 is going :
> > spin_lock(mutex.wait_lock)
> > if (!list_empty(mutex.wait_list)) {
> >   // wait_list is empty so branch not taken
> > }
> > spin_unlock(mutex.wait_lock)
> > 
> > CPU2 would have to test the mirror list and mutex_unlock and return
> > before the spin_unlock() of CPU1. This is a tight race, i can add a
> > synchronize_rcu() to device_unregister after the mutex_unlock() so
> > that we also add a grace period before the device is potentialy freed
> > which should make that race completely unlikely.
> > 
> > Moreover for something really bad to happen it would need that the
> > freed memory to be reallocated right away by some other thread. Which
> > really sound unlikely unless CPU1 is the slowest of all :)
> > 
> > Cheers,
> > Jérôme
> > 
> 
> But CPU1 could get preempted between the atomic_set and the 
> spin_lock_mutex, and then it doesn't matter whether or not a grace period 
> has elapsed before CPU2 proceeds.
> 
> Making race conditions less likely just makes them harder to pinpoint when 
> they inevitably appear in the wild. I don't think it makes sense to spend 
> any effort in making a race condition less likely, and that thread I 
> referenced (https://lkml.org/lkml/2013/12/2/997) is fairly strong evidence 
> that fixing this race actually matters. So, I think this race condition 
> really needs to be fixed.
> 
> One fix is for hmm_mirror_unregister to wait for hmm_notifier_release 
> completion between hmm_mirror_kill and hmm_mirror_unref. It can do this by 
> calling synchronize_srcu() on the mmu_notifier's srcu. This has the 
> benefit that the driver is guaranteed not to get the "mm is dead" callback 
> after hmm_mirror_unregister returns.
> 
> In fact, are there any callbacks on the mirror that can arrive after 
> hmm_mirror_unregister? If so, how will hmm_device_unregister solve them?
> 
> From a general standpoint, hmm_device_unregister must perform some kind of 
> synchronization to be sure that all mirrors are completely released and 
> done and no new callbacks will trigger. Since that has to be true, can't 
> that synchronization be moved into hmm_mirror_unregister instead?
> 
> If that happens there's no need for a "mirror can be freed" ->release 
> callback at all because the driver is guaranteed that a mirror is done 
> after hmm_mirror_unregister.

Well, there is no need for two callbacks (release|stop, free), just one:
the release|stop that is needed. I kind of went halfway last week on this.
I will probably rework that a little to keep just one callback and rely on
the driver to call hmm_mirror_unregister().

Cheers,
Jérôme
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 78ea7b6..2f2a2be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4730,6 +4730,13 @@  F:	include/uapi/linux/if_hippi.h
 F:	net/802/hippi.c
 F:	drivers/net/hippi/
 
+HMM - Heterogeneous Memory Management
+M:	Jérôme Glisse <jglisse@redhat.com>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	mm/hmm.c
+F:	include/linux/hmm.h
+
 HOST AP DRIVER
 M:	Jouni Malinen <j@w1.fi>
 L:	hostap@shmoo.com (subscribers-only)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index 0000000..175a757
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,164 @@ 
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
+ */
+/* This is heterogeneous memory management (HMM). In a nutshell this provides
+ * an API to mirror a process address space on a device which has its own mmu
+ * using its own page table for the process. It supports everything except
+ * special vma.
+ *
+ * Mandatory hardware features :
+ *   - An mmu with pagetable.
+ *   - Read only flag per cpu page.
+ *   - Page fault ie hardware must stop and wait for kernel to service fault.
+ *
+ * Optional hardware features :
+ *   - Dirty bit per cpu page.
+ *   - Access bit per cpu page.
+ *
+ * The hmm code handles all the interfacing with the core kernel mm code and
+ * provides a simple API. It does support migrating system memory to device
+ * memory and handles migration back to system memory on cpu page fault.
+ *
+ * Migrated memory is considered as swapped from the cpu and core mm code
+ * point of view.
+ */
+#ifndef _HMM_H
+#define _HMM_H
+
+#ifdef CONFIG_HMM
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/workqueue.h>
+#include <linux/mman.h>
+
+
+struct hmm_device;
+struct hmm_mirror;
+struct hmm;
+
+
+/* hmm_device - Each device must register one and only one hmm_device.
+ *
+ * The hmm_device is the link btw HMM and each device driver.
+ */
+
+/* struct hmm_device_operations - HMM device operation callback
+ */
+struct hmm_device_ops {
+	/* release() - mirror must stop using the address space.
+	 *
+	 * @mirror: The mirror that link process address space with the device.
+	 *
+	 * When this is called, the device driver must kill all device threads
+	 * using this mirror. Also, this callback is the last thing called by
+	 * HMM, and HMM will not access the mirror struct after this call (ie
+	 * no more dereferences of it, so it is safe for the driver to free it).
+	 * It is called either from:
+	 *   - mm dying (all process using this mm exiting).
+	 *   - hmm_mirror_unregister() (if no other thread holds a reference)
+	 *   - outcome of some device error reported by any of the device
+	 *     callback against that mirror.
+	 */
+	void (*release)(struct hmm_mirror *mirror);
+};
+
+
+/* struct hmm - per mm_struct HMM states.
+ *
+ * @mm: The mm struct this hmm is associated with.
+ * @mirrors: List of all mirror for this mm (one per device).
+ * @vm_end: Last valid address for this mm (exclusive).
+ * @kref: Reference counter.
+ * @rwsem: Serialize the mirror list modifications.
+ * @mmu_notifier: The mmu_notifier of this mm.
+ * @rcu: For delayed cleanup call from mmu_notifier.release() callback.
+ *
+ * For each process address space (mm_struct) there is one and only one hmm
+ * struct. hmm functions will redispatch to each devices the change made to
+ * the process address space.
+ *
+ * Device driver must not access this structure other than for getting the
+ * mm pointer.
+ */
+struct hmm {
+	struct mm_struct	*mm;
+	struct hlist_head	mirrors;
+	unsigned long		vm_end;
+	struct kref		kref;
+	struct rw_semaphore	rwsem;
+	struct mmu_notifier	mmu_notifier;
+	struct rcu_head		rcu;
+};
+
+
+/* struct hmm_device - per device HMM structure
+ *
+ * @dev: Linux device structure pointer.
+ * @ops: The hmm operations callback.
+ * @mirrors: List of all active mirrors for the device.
+ * @mutex: Mutex protecting mirrors list.
+ *
+ * Each device that wants to mirror an address space must register one of
+ * these structs (only once per linux device).
+ */
+struct hmm_device {
+	struct device			*dev;
+	const struct hmm_device_ops	*ops;
+	struct list_head		mirrors;
+	struct mutex			mutex;
+};
+
+int hmm_device_register(struct hmm_device *device);
+int hmm_device_unregister(struct hmm_device *device);
+
+
+/* hmm_mirror - device specific mirroring functions.
+ *
+ * Each device that mirrors a process has a unique hmm_mirror struct
+ * associating the process address space with the device. The same process can
+ * be mirrored by several different devices at the same time.
+ */
+
+/* struct hmm_mirror - per device and per mm HMM structure
+ *
+ * @device: The hmm_device struct this hmm_mirror is associated to.
+ * @hmm: The hmm struct this hmm_mirror is associated to.
+ * @kref: Reference counter (private to HMM do not use).
+ * @dlist: List of all hmm_mirror for same device.
+ * @mlist: List of all hmm_mirror for same process.
+ *
+ * Each device that wants to mirror an address space must register one of
+ * these structs for each address space it wants to mirror. The same device
+ * can mirror several different address spaces, and the same address space can
+ * be mirrored by different devices.
+ */
+struct hmm_mirror {
+	struct hmm_device	*device;
+	struct hmm		*hmm;
+	struct kref		kref;
+	struct list_head	dlist;
+	struct hlist_node	mlist;
+};
+
+int hmm_mirror_register(struct hmm_mirror *mirror);
+void hmm_mirror_unregister(struct hmm_mirror *mirror);
+
+
+#endif /* CONFIG_HMM */
+#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2923a51..cf642d9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2199,5 +2199,16 @@  void __init setup_nr_node_ids(void);
 static inline void setup_nr_node_ids(void) {}
 #endif
 
+#ifdef CONFIG_HMM
+static inline void hmm_mm_init(struct mm_struct *mm)
+{
+	mm->hmm = NULL;
+}
+#else /* !CONFIG_HMM */
+static inline void hmm_mm_init(struct mm_struct *mm)
+{
+}
+#endif /* !CONFIG_HMM */
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0038ac7..4494f7f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -15,6 +15,10 @@ 
 #include <asm/page.h>
 #include <asm/mmu.h>
 
+#ifdef CONFIG_HMM
+struct hmm;
+#endif
+
 #ifndef AT_VECTOR_SIZE_ARCH
 #define AT_VECTOR_SIZE_ARCH 0
 #endif
@@ -451,6 +455,16 @@  struct mm_struct {
 #ifdef CONFIG_MMU_NOTIFIER
 	struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
+#ifdef CONFIG_HMM
+	/*
+	 * hmm always register an mmu_notifier we rely on mmu notifier to keep
+	 * refcount on the mm struct as well as forbidding registering hmm on a
+	 * dying mm
+	 *
+	 * This field is set with mmap_sem held in write mode.
+	 */
+	struct hmm *hmm;
+#endif
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 0e0ae9a..4083be7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,6 +27,7 @@ 
 #include <linux/binfmts.h>
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
+#include <linux/hmm.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/vmacache.h>
@@ -597,6 +598,7 @@  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
 	mmu_notifier_mm_init(mm);
+	hmm_mm_init(mm);
 	clear_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
diff --git a/mm/Kconfig b/mm/Kconfig
index 52ffb86..189e48f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -653,3 +653,18 @@  config DEFERRED_STRUCT_PAGE_INIT
 	  when kswapd starts. This has a potential performance impact on
 	  processes running early in the lifetime of the systemm until kswapd
 	  finishes the initialisation.
+
+if STAGING
+config HMM
+	bool "Enable heterogeneous memory management (HMM)"
+	depends on MMU
+	select MMU_NOTIFIER
+	select GENERIC_PAGE_TABLE
+	default n
+	help
+	  Heterogeneous memory management provides infrastructure for a device
+	  to mirror a process address space into a hardware mmu or into
+	  anything supporting pagefault-like events.
+
+	  If unsure, say N to disable hmm.
+endif # STAGING
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eae..90ca9c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@  obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_HMM) += hmm.o
diff --git a/mm/hmm.c b/mm/hmm.c
new file mode 100644
index 0000000..e684dd0
--- /dev/null
+++ b/mm/hmm.c
@@ -0,0 +1,370 @@ 
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
+ */
+/* This is the core code for heterogeneous memory management (HMM). HMM intends
+ * to provide helpers for mirroring a process address space on a device as well
+ * as allowing migration of data between system memory and device memory,
+ * referred to as remote memory from here on out.
+ *
+ * Refer to include/linux/hmm.h for further information on general design.
+ */
+#include <linux/export.h>
+#include <linux/bitmap.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/mmu_context.h>
+#include <linux/memcontrol.h>
+#include <linux/hmm.h>
+#include <linux/wait.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+
+#include "internal.h"
+
+static struct mmu_notifier_ops hmm_notifier_ops;
+
+static inline struct hmm_mirror *hmm_mirror_ref(struct hmm_mirror *mirror);
+static inline void hmm_mirror_unref(struct hmm_mirror **mirror);
+
+
+/* hmm - core HMM functions.
+ *
+ * Core HMM functions that deal with all the process mm activities.
+ */
+
+static int hmm_init(struct hmm *hmm)
+{
+	hmm->mm = current->mm;
+	hmm->vm_end = TASK_SIZE;
+	kref_init(&hmm->kref);
+	INIT_HLIST_HEAD(&hmm->mirrors);
+	init_rwsem(&hmm->rwsem);
+
+	/* register notifier */
+	hmm->mmu_notifier.ops = &hmm_notifier_ops;
+	return __mmu_notifier_register(&hmm->mmu_notifier, current->mm);
+}
+
+static int hmm_add_mirror(struct hmm *hmm, struct hmm_mirror *mirror)
+{
+	struct hmm_mirror *tmp;
+
+	down_write(&hmm->rwsem);
+	hlist_for_each_entry(tmp, &hmm->mirrors, mlist)
+		if (tmp->device == mirror->device) {
+			/* Same device can mirror only once. */
+			up_write(&hmm->rwsem);
+			return -EINVAL;
+		}
+	hlist_add_head(&mirror->mlist, &hmm->mirrors);
+	hmm_mirror_ref(mirror);
+	up_write(&hmm->rwsem);
+
+	return 0;
+}
+
+static inline struct hmm *hmm_ref(struct hmm *hmm)
+{
+	if (!hmm || !kref_get_unless_zero(&hmm->kref))
+		return NULL;
+	return hmm;
+}
+
+static void hmm_destroy_delayed(struct rcu_head *rcu)
+{
+	struct hmm *hmm;
+
+	hmm = container_of(rcu, struct hmm, rcu);
+	kfree(hmm);
+}
+
+static void hmm_destroy(struct kref *kref)
+{
+	struct hmm *hmm;
+
+	hmm = container_of(kref, struct hmm, kref);
+	BUG_ON(!hlist_empty(&hmm->mirrors));
+
+	down_write(&hmm->mm->mmap_sem);
+	/* A new hmm might have been register before reaching that point. */
+	if (hmm->mm->hmm == hmm)
+		hmm->mm->hmm = NULL;
+	up_write(&hmm->mm->mmap_sem);
+
+	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
+
+	mmu_notifier_call_srcu(&hmm->rcu, &hmm_destroy_delayed);
+}
+
+static inline struct hmm *hmm_unref(struct hmm *hmm)
+{
+	if (hmm)
+		kref_put(&hmm->kref, hmm_destroy);
+	return NULL;
+}
+
+
+/* hmm_notifier - HMM callback for mmu_notifier tracking change to process mm.
+ *
+ * HMM uses the mmu notifier to track changes made to the process address space.
+ */
+static void hmm_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct hmm *hmm;
+
+	hmm = hmm_ref(container_of(mn, struct hmm, mmu_notifier));
+	if (!hmm)
+		return;
+
+	down_write(&hmm->rwsem);
+	while (hmm->mirrors.first) {
+		struct hmm_mirror *mirror;
+
+		/*
+		 * Here we are holding the mirror reference from the mirror
+		 * list. As list removal is synchronized through the rwsem, no
+		 * other thread can assume it holds that reference.
+		 */
+		mirror = hlist_entry(hmm->mirrors.first,
+				     struct hmm_mirror,
+				     mlist);
+		hlist_del_init(&mirror->mlist);
+		up_write(&hmm->rwsem);
+
+		hmm_mirror_unref(&mirror);
+
+		down_write(&hmm->rwsem);
+	}
+	up_write(&hmm->rwsem);
+
+	hmm_unref(hmm);
+}
+
+static struct mmu_notifier_ops hmm_notifier_ops = {
+	.release		= hmm_notifier_release,
+};
+
+
+/* hmm_mirror - per device mirroring functions.
+ *
+ * Each device that mirrors a process has a unique hmm_mirror struct. A
+ * process can be mirrored by several devices at the same time.
+ *
+ * Below are all the functions and their helpers used by device drivers to
+ * mirror the process address space. Those functions either deal with updating
+ * the device page table (through hmm callbacks) or provide helper functions
+ * used by the driver to fault in a range of memory in the device page table.
+ */
+static inline struct hmm_mirror *hmm_mirror_ref(struct hmm_mirror *mirror)
+{
+	if (!mirror || !kref_get_unless_zero(&mirror->kref))
+		return NULL;
+	return mirror;
+}
+
+static void hmm_mirror_destroy(struct kref *kref)
+{
+	struct hmm_device *device;
+	struct hmm_mirror *mirror;
+	struct hmm *hmm;
+
+	mirror = container_of(kref, struct hmm_mirror, kref);
+	device = mirror->device;
+	hmm = mirror->hmm;
+
+	mutex_lock(&device->mutex);
+	list_del_init(&mirror->dlist);
+	device->ops->release(mirror);
+	mutex_unlock(&device->mutex);
+}
+
+static inline void hmm_mirror_unref(struct hmm_mirror **mirror)
+{
+	struct hmm_mirror *tmp = mirror ? *mirror : NULL;
+
+	if (tmp) {
+		*mirror = NULL;
+		kref_put(&tmp->kref, hmm_mirror_destroy);
+	}
+}
+
+/* hmm_mirror_register() - register mirror against current process for a device.
+ *
+ * @mirror: The mirror struct being registered.
+ * Returns: 0 on success or -ENOMEM, -EINVAL on error.
+ *
+ * Called when the device driver wants to start mirroring a process address
+ * space. The HMM shim will register an mmu_notifier and start monitoring
+ * process address space changes. Hence callbacks to the device driver might
+ * happen even before this function returns.
+ *
+ * The task the device driver wants to mirror must be current!
+ *
+ * Only one mirror per mm and hmm_device can be created, it will return NULL if
+ * the hmm_device already has an hmm_mirror for the the mm.
+ */
+int hmm_mirror_register(struct hmm_mirror *mirror)
+{
+	struct mm_struct *mm = current->mm;
+	struct hmm *hmm = NULL;
+	int ret = 0;
+
+	/* Sanity checks. */
+	BUG_ON(!mirror);
+	BUG_ON(!mirror->device);
+	BUG_ON(!mm);
+
+	/*
+	 * Initialize the mirror struct fields, the mlist init and del dance is
+	 * necessary to make the error path easier for driver and for hmm.
+	 */
+	kref_init(&mirror->kref);
+	INIT_HLIST_NODE(&mirror->mlist);
+	INIT_LIST_HEAD(&mirror->dlist);
+	mutex_lock(&mirror->device->mutex);
+	list_add(&mirror->dlist, &mirror->device->mirrors);
+	mutex_unlock(&mirror->device->mutex);
+
+	down_write(&mm->mmap_sem);
+
+	/*
+	 * Reuse the mm's existing hmm if there is one. Note this must
+	 * reference mm->hmm, not the NULL-initialized local, otherwise an
+	 * existing hmm would never be found and a second registration would
+	 * silently overwrite mm->hmm, orphaning the first hmm and all its
+	 * mirrors. hmm_ref() is presumed to return NULL when the refcount
+	 * already dropped to zero, in which case a fresh hmm is allocated.
+	 */
+	hmm = mm->hmm ? hmm_ref(mm->hmm) : NULL;
+	if (hmm == NULL) {
+		/* no hmm registered yet so register one */
+		hmm = kzalloc(sizeof(*hmm), GFP_KERNEL);
+		if (hmm == NULL) {
+			up_write(&mm->mmap_sem);
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		ret = hmm_init(hmm);
+		if (ret) {
+			up_write(&mm->mmap_sem);
+			kfree(hmm);
+			goto error;
+		}
+
+		mm->hmm = hmm;
+	}
+
+	mirror->hmm = hmm;
+	ret = hmm_add_mirror(hmm, mirror);
+	up_write(&mm->mmap_sem);
+	if (ret) {
+		/* Duplicate mirror for this device; drop our hmm reference. */
+		mirror->hmm = NULL;
+		hmm_unref(hmm);
+		goto error;
+	}
+	return 0;
+
+error:
+	/* Undo the device-list insertion done above. */
+	mutex_lock(&mirror->device->mutex);
+	list_del_init(&mirror->dlist);
+	mutex_unlock(&mirror->device->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(hmm_mirror_register);
+
+/*
+ * Unlink a mirror from its hmm's mirror list, at most once.
+ *
+ * The hlist_unhashed() test under hmm->rwsem makes this idempotent: only the
+ * caller that actually removes the mirror from the list drops the reference
+ * that the list held, so concurrent unregister/notifier-release paths cannot
+ * double-drop it.
+ */
+static void hmm_mirror_kill(struct hmm_mirror *mirror)
+{
+	down_write(&mirror->hmm->rwsem);
+	if (!hlist_unhashed(&mirror->mlist)) {
+		hlist_del_init(&mirror->mlist);
+		up_write(&mirror->hmm->rwsem);
+
+		/* Drop the reference the mlist was holding. */
+		hmm_mirror_unref(&mirror);
+	} else
+		up_write(&mirror->hmm->rwsem);
+}
+
+/* hmm_mirror_unregister() - unregister a mirror.
+ *
+ * @mirror: The mirror that link process address space with the device.
+ *
+ * Driver can call this function when it wants to stop mirroring a process.
+ * This will trigger a call to the ->release() callback if it did not aleady
+ * happen.
+ *
+ * Note that caller must hold a reference on the mirror.
+ */
+void hmm_mirror_unregister(struct hmm_mirror *mirror)
+{
+	if (!mirror)
+		return;
+
+	/*
+	 * Unlink from the hmm mirror list (idempotent), then drop the
+	 * caller's reference; the driver ->release() callback fires once the
+	 * last reference is gone.
+	 */
+	hmm_mirror_kill(mirror);
+	hmm_mirror_unref(&mirror);
+}
+EXPORT_SYMBOL(hmm_mirror_unregister);
+
+
+/* hmm_device - Each device driver must register one and only one hmm_device.
+ *
+ * The hmm_device is the link between HMM and each device driver.
+ */
+
+/* hmm_device_register() - register a device with HMM.
+ *
+ * @device: The hmm_device struct.
+ * Returns: 0 on success or -EINVAL otherwise.
+ *
+ * Call when a device driver wants to register itself with HMM. A device
+ * driver must only register once.
+ */
+int hmm_device_register(struct hmm_device *device)
+{
+	/* A device without ops, or without ->release(), cannot be mirrored. */
+	BUG_ON(!device);
+	BUG_ON(!device->ops);
+	BUG_ON(!device->ops->release);
+
+	INIT_LIST_HEAD(&device->mirrors);
+	mutex_init(&device->mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(hmm_device_register);
+
+/* hmm_device_unregister() - unregister a device with HMM.
+ *
+ * @device: The hmm_device struct.
+ * Returns: 0 on success or -EBUSY otherwise.
+ *
+ * Call when a device driver wants to unregister itself with HMM. This will
+ * check that there are no active mirrors and return -EBUSY if there are.
+ */
+int hmm_device_unregister(struct hmm_device *device)
+{
+	int ret = 0;
+
+	/* Refuse to unregister while any mirror is still registered. */
+	mutex_lock(&device->mutex);
+	if (!list_empty(&device->mirrors))
+		ret = -EBUSY;
+	mutex_unlock(&device->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL(hmm_device_unregister);