
[v4,1/2] shmem: Support for registration of driver/file owner specific ops

Message ID 1459775891-32442-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson April 4, 2016, 1:18 p.m. UTC
From: Akash Goel <akash.goel@intel.com>

This provides support for drivers or shmem file owners to register
a set of callbacks, which can be invoked from the address space
operations implemented by shmem.  This allows the file owners to
hook into the shmem address space operations to do some extra/custom
operations in addition to the default ones.

The private_data field of the address_space struct is used to store
the pointer to the driver-specific ops.  Currently only one ops field
is defined, migratepage, but it can be extended on an as-needed basis.

The need for driver-specific operations arises because some operations
(like migratepage) cannot be handled effectively within shmem alone
and need some driver-specific handling as well.
Specifically, i915.ko would like to participate in migratepage().
i915.ko uses shmemfs to provide swappable backing storage for its user
objects, but when those objects are in use by the GPU it must pin the
entire object until the GPU is idle.  As a result, large chunks of memory
can be arbitrarily withdrawn from page migration, resulting in premature
out-of-memory due to fragmentation.  However, if i915.ko can receive the
migratepage() request, it can then flush the object from the GPU, remove
its pin and thus enable the migration.

Since gfx allocations are one of the major consumers of system memory,
it's imperative to have such a mechanism to deal effectively with
fragmentation; hence the provision for initiating driver-specific
actions during address space operations.

Cc: Hugh Dickins <hughd@google.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 include/linux/shmem_fs.h | 17 +++++++++++++++++
 mm/shmem.c               | 17 ++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)
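
As an illustration of the intended use, here is a minimal sketch of how
a driver could register its callback after creating the shmem backing
file.  Only struct shmem_dev_info and shmem_set_device_ops() come from
this patch; shmem_file_setup() is the existing shmem API, and every
my_drv_* name is hypothetical.

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/migrate.h>
#include <linux/shmem_fs.h>

/* Hypothetical driver-private state, passed back via dev_private_data. */
struct my_drv_private {
	/* ... */
};

static int my_drv_migratepage(struct address_space *mapping,
			      struct page *newpage, struct page *page,
			      enum migrate_mode mode, void *dev_priv_data)
{
	/* Driver-specific work (e.g. unpinning the object backing
	 * 'page') would go here, before handing the actual page copy
	 * back to the core migration code. */
	return migrate_page(mapping, newpage, page, mode);
}

static struct shmem_dev_info my_drv_shmem_info = {
	.dev_migratepage = my_drv_migratepage,
};

static int my_drv_create_object(struct my_drv_private *priv, loff_t size)
{
	struct file *filp = shmem_file_setup("my-drv-object", size,
					     VM_NORESERVE);

	if (IS_ERR(filp))
		return PTR_ERR(filp);

	my_drv_shmem_info.dev_private_data = priv;

	/* Hook our callback into the shmem address space operations. */
	return shmem_set_device_ops(filp->f_mapping, &my_drv_shmem_info);
}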

Comments

Chris Wilson April 15, 2016, 4:09 p.m. UTC | #1
On Mon, Apr 04, 2016 at 02:18:10PM +0100, Chris Wilson wrote:
> From: Akash Goel <akash.goel@intel.com>
> 
> This provides support for drivers or shmem file owners to register
> a set of callbacks, which can be invoked from the address space
> operations implemented by shmem.  This allows the file owners to
> hook into the shmem address space operations to do some extra/custom
> operations in addition to the default ones.
> 
> The private_data field of the address_space struct is used to store
> the pointer to the driver-specific ops.  Currently only one ops field
> is defined, migratepage, but it can be extended on an as-needed basis.
> 
> The need for driver-specific operations arises because some operations
> (like migratepage) cannot be handled effectively within shmem alone
> and need some driver-specific handling as well.
> Specifically, i915.ko would like to participate in migratepage().
> i915.ko uses shmemfs to provide swappable backing storage for its user
> objects, but when those objects are in use by the GPU it must pin the
> entire object until the GPU is idle.  As a result, large chunks of memory
> can be arbitrarily withdrawn from page migration, resulting in premature
> out-of-memory due to fragmentation.  However, if i915.ko can receive the
> migratepage() request, it can then flush the object from the GPU, remove
> its pin and thus enable the migration.
> 
> Since gfx allocations are one of the major consumers of system memory,
> it's imperative to have such a mechanism to deal effectively with
> fragmentation; hence the provision for initiating driver-specific
> actions during address space operations.
> 
> Cc: Hugh Dickins <hughd@google.com>
> Cc: linux-mm@kvack.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Ping?

> ---
>  include/linux/shmem_fs.h | 17 +++++++++++++++++
>  mm/shmem.c               | 17 ++++++++++++++++-
>  2 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 4d4780c00d34..d7925b66c240 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -34,11 +34,28 @@ struct shmem_sb_info {
>  	struct mempolicy *mpol;     /* default memory policy for mappings */
>  };
>  
> +struct shmem_dev_info {
> +	void *dev_private_data;
> +	int (*dev_migratepage)(struct address_space *mapping,
> +			       struct page *newpage, struct page *page,
> +			       enum migrate_mode mode, void *dev_priv_data);
> +};
> +
>  static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
>  {
>  	return container_of(inode, struct shmem_inode_info, vfs_inode);
>  }
>  
> +static inline int shmem_set_device_ops(struct address_space *mapping,
> +				       struct shmem_dev_info *info)
> +{
> +	if (mapping->private_data != NULL)
> +		return -EEXIST;
> +
> +	mapping->private_data = info;
> +	return 0;
> +}
> +
>  /*
>   * Functions in mm/shmem.c called directly from elsewhere:
>   */
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9428c51ab2d6..6ed953193883 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -947,6 +947,21 @@ redirty:
>  	return 0;
>  }
>  
> +#ifdef CONFIG_MIGRATION
> +static int shmem_migratepage(struct address_space *mapping,
> +			     struct page *newpage, struct page *page,
> +			     enum migrate_mode mode)
> +{
> +	struct shmem_dev_info *dev_info = mapping->private_data;
> +
> +	if (dev_info && dev_info->dev_migratepage)
> +		return dev_info->dev_migratepage(mapping, newpage, page,
> +				mode, dev_info->dev_private_data);
> +
> +	return migrate_page(mapping, newpage, page, mode);
> +}
> +#endif
> +
>  #ifdef CONFIG_NUMA
>  #ifdef CONFIG_TMPFS
>  static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
> @@ -3161,7 +3176,7 @@ static const struct address_space_operations shmem_aops = {
>  	.write_end	= shmem_write_end,
>  #endif
>  #ifdef CONFIG_MIGRATION
> -	.migratepage	= migrate_page,
> +	.migratepage	= shmem_migratepage,
>  #endif
>  	.error_remove_page = generic_error_remove_page,
>  };
Daniel Vetter April 20, 2016, 12:38 p.m. UTC | #2
On Fri, Apr 15, 2016 at 05:09:24PM +0100, Chris Wilson wrote:
> On Mon, Apr 04, 2016 at 02:18:10PM +0100, Chris Wilson wrote:
> > From: Akash Goel <akash.goel@intel.com>
> > 
> > This provides support for drivers or shmem file owners to register
> > a set of callbacks, which can be invoked from the address space
> > operations implemented by shmem.  This allows the file owners to
> > hook into the shmem address space operations to do some extra/custom
> > operations in addition to the default ones.
> > 
> > The private_data field of the address_space struct is used to store
> > the pointer to the driver-specific ops.  Currently only one ops field
> > is defined, migratepage, but it can be extended on an as-needed basis.
> > 
> > The need for driver-specific operations arises because some operations
> > (like migratepage) cannot be handled effectively within shmem alone
> > and need some driver-specific handling as well.
> > Specifically, i915.ko would like to participate in migratepage().
> > i915.ko uses shmemfs to provide swappable backing storage for its user
> > objects, but when those objects are in use by the GPU it must pin the
> > entire object until the GPU is idle.  As a result, large chunks of memory
> > can be arbitrarily withdrawn from page migration, resulting in premature
> > out-of-memory due to fragmentation.  However, if i915.ko can receive the
> > migratepage() request, it can then flush the object from the GPU, remove
> > its pin and thus enable the migration.
> > 
> > Since gfx allocations are one of the major consumers of system memory,
> > it's imperative to have such a mechanism to deal effectively with
> > fragmentation; hence the provision for initiating driver-specific
> > actions during address space operations.
> > 
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: linux-mm@kvack.org
> > Cc: linux-kernel@vger.kernel.org
> > Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
> > Signed-off-by: Akash Goel <akash.goel@intel.com>
> > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Ping?

This addresses a long-standing issue we have in i915 gem of essentially
pinning way too much memory as unmovable. It would be nice to get this
merged. It also seems to fix some real-world issues we're seeing
(linked from the i915 patch).

Can I have acks/reviews from -mm folks to get this in through drm trees
please?

Thanks, Daniel

> 
> > ---
> >  include/linux/shmem_fs.h | 17 +++++++++++++++++
> >  mm/shmem.c               | 17 ++++++++++++++++-
> >  2 files changed, 33 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> > index 4d4780c00d34..d7925b66c240 100644
> > --- a/include/linux/shmem_fs.h
> > +++ b/include/linux/shmem_fs.h
> > @@ -34,11 +34,28 @@ struct shmem_sb_info {
> >  	struct mempolicy *mpol;     /* default memory policy for mappings */
> >  };
> >  
> > +struct shmem_dev_info {
> > +	void *dev_private_data;
> > +	int (*dev_migratepage)(struct address_space *mapping,
> > +			       struct page *newpage, struct page *page,
> > +			       enum migrate_mode mode, void *dev_priv_data);
> > +};
> > +
> >  static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
> >  {
> >  	return container_of(inode, struct shmem_inode_info, vfs_inode);
> >  }
> >  
> > +static inline int shmem_set_device_ops(struct address_space *mapping,
> > +				       struct shmem_dev_info *info)
> > +{
> > +	if (mapping->private_data != NULL)
> > +		return -EEXIST;
> > +
> > +	mapping->private_data = info;
> > +	return 0;
> > +}
> > +
> >  /*
> >   * Functions in mm/shmem.c called directly from elsewhere:
> >   */
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 9428c51ab2d6..6ed953193883 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -947,6 +947,21 @@ redirty:
> >  	return 0;
> >  }
> >  
> > +#ifdef CONFIG_MIGRATION
> > +static int shmem_migratepage(struct address_space *mapping,
> > +			     struct page *newpage, struct page *page,
> > +			     enum migrate_mode mode)
> > +{
> > +	struct shmem_dev_info *dev_info = mapping->private_data;
> > +
> > +	if (dev_info && dev_info->dev_migratepage)
> > +		return dev_info->dev_migratepage(mapping, newpage, page,
> > +				mode, dev_info->dev_private_data);
> > +
> > +	return migrate_page(mapping, newpage, page, mode);
> > +}
> > +#endif
> > +
> >  #ifdef CONFIG_NUMA
> >  #ifdef CONFIG_TMPFS
> >  static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
> > @@ -3161,7 +3176,7 @@ static const struct address_space_operations shmem_aops = {
> >  	.write_end	= shmem_write_end,
> >  #endif
> >  #ifdef CONFIG_MIGRATION
> > -	.migratepage	= migrate_page,
> > +	.migratepage	= shmem_migratepage,
> >  #endif
> >  	.error_remove_page = generic_error_remove_page,
> >  };
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: email@kvack.org
Kirill A. Shutemov April 24, 2016, 11:42 p.m. UTC | #3
On Mon, Apr 04, 2016 at 02:18:10PM +0100, Chris Wilson wrote:
> From: Akash Goel <akash.goel@intel.com>
> 
> This provides support for drivers or shmem file owners to register
> a set of callbacks, which can be invoked from the address space
> operations implemented by shmem.  This allows the file owners to
> hook into the shmem address space operations to do some extra/custom
> operations in addition to the default ones.
> 
> The private_data field of the address_space struct is used to store
> the pointer to the driver-specific ops.  Currently only one ops field
> is defined, migratepage, but it can be extended on an as-needed basis.
> 
> The need for driver-specific operations arises because some operations
> (like migratepage) cannot be handled effectively within shmem alone
> and need some driver-specific handling as well.
> Specifically, i915.ko would like to participate in migratepage().
> i915.ko uses shmemfs to provide swappable backing storage for its user
> objects, but when those objects are in use by the GPU it must pin the
> entire object until the GPU is idle.  As a result, large chunks of memory
> can be arbitrarily withdrawn from page migration, resulting in premature
> out-of-memory due to fragmentation.  However, if i915.ko can receive the
> migratepage() request, it can then flush the object from the GPU, remove
> its pin and thus enable the migration.
> 
> Since gfx allocations are one of the major consumers of system memory,
> it's imperative to have such a mechanism to deal effectively with
> fragmentation; hence the provision for initiating driver-specific
> actions during address space operations.

Hm. Sorry, my ignorance, but shouldn't this kind of flushing be done in
response to mmu_notifier's ->invalidate_page?

I'm not aware of how i915 works and what its expectations are wrt shmem.
Do you have some userspace VMA which is mirrored on the GPU side?
If yes, migration would cause unmapping of these pages and trigger the
mmu_notifier's hook.
Daniel Vetter April 26, 2016, 12:53 p.m. UTC | #4
On Mon, Apr 25, 2016 at 02:42:50AM +0300, Kirill A. Shutemov wrote:
> On Mon, Apr 04, 2016 at 02:18:10PM +0100, Chris Wilson wrote:
> > From: Akash Goel <akash.goel@intel.com>
> > 
> > This provides support for drivers or shmem file owners to register
> > a set of callbacks, which can be invoked from the address space
> > operations implemented by shmem.  This allows the file owners to
> > hook into the shmem address space operations to do some extra/custom
> > operations in addition to the default ones.
> > 
> > The private_data field of the address_space struct is used to store
> > the pointer to the driver-specific ops.  Currently only one ops field
> > is defined, migratepage, but it can be extended on an as-needed basis.
> > 
> > The need for driver-specific operations arises because some operations
> > (like migratepage) cannot be handled effectively within shmem alone
> > and need some driver-specific handling as well.
> > Specifically, i915.ko would like to participate in migratepage().
> > i915.ko uses shmemfs to provide swappable backing storage for its user
> > objects, but when those objects are in use by the GPU it must pin the
> > entire object until the GPU is idle.  As a result, large chunks of memory
> > can be arbitrarily withdrawn from page migration, resulting in premature
> > out-of-memory due to fragmentation.  However, if i915.ko can receive the
> > migratepage() request, it can then flush the object from the GPU, remove
> > its pin and thus enable the migration.
> > 
> > Since gfx allocations are one of the major consumers of system memory,
> > it's imperative to have such a mechanism to deal effectively with
> > fragmentation; hence the provision for initiating driver-specific
> > actions during address space operations.
> 
> Hm. Sorry, my ignorance, but shouldn't this kind of flushing be done in
> response to mmu_notifier's ->invalidate_page?
> 
> I'm not aware of how i915 works and what its expectations are wrt shmem.
> Do you have some userspace VMA which is mirrored on the GPU side?
> If yes, migration would cause unmapping of these pages and trigger the
> mmu_notifier's hook.

We do that for userptr pages (i.e. stuff we steal from userspace address
spaces). But we also have native gfx buffer objects based on shmem files,
and thus far we need to allocate them as !__GFP_MOVABLE. And we allocate a
_lot_ of those. And those files aren't mapped into any cpu address space
(ofc they're mapped on the gpu side, but that's driver private), from the
core mm they are pure pagecache. And afaiui for that we need to wire up
the migratepage hooks through shmem to i915_gem.c
-Daniel
Kirill A. Shutemov April 26, 2016, 11:33 p.m. UTC | #5
On Tue, Apr 26, 2016 at 02:53:41PM +0200, Daniel Vetter wrote:
> On Mon, Apr 25, 2016 at 02:42:50AM +0300, Kirill A. Shutemov wrote:
> > On Mon, Apr 04, 2016 at 02:18:10PM +0100, Chris Wilson wrote:
> > > From: Akash Goel <akash.goel@intel.com>
> > > 
> > > This provides support for drivers or shmem file owners to register
> > > a set of callbacks, which can be invoked from the address space
> > > operations implemented by shmem.  This allows the file owners to
> > > hook into the shmem address space operations to do some extra/custom
> > > operations in addition to the default ones.
> > > 
> > > The private_data field of the address_space struct is used to store
> > > the pointer to the driver-specific ops.  Currently only one ops field
> > > is defined, migratepage, but it can be extended on an as-needed basis.
> > > 
> > > The need for driver-specific operations arises because some operations
> > > (like migratepage) cannot be handled effectively within shmem alone
> > > and need some driver-specific handling as well.
> > > Specifically, i915.ko would like to participate in migratepage().
> > > i915.ko uses shmemfs to provide swappable backing storage for its user
> > > objects, but when those objects are in use by the GPU it must pin the
> > > entire object until the GPU is idle.  As a result, large chunks of memory
> > > can be arbitrarily withdrawn from page migration, resulting in premature
> > > out-of-memory due to fragmentation.  However, if i915.ko can receive the
> > > migratepage() request, it can then flush the object from the GPU, remove
> > > its pin and thus enable the migration.
> > > 
> > > Since gfx allocations are one of the major consumers of system memory,
> > > it's imperative to have such a mechanism to deal effectively with
> > > fragmentation; hence the provision for initiating driver-specific
> > > actions during address space operations.
> > 
> > Hm. Sorry, my ignorance, but shouldn't this kind of flushing be done in
> > response to mmu_notifier's ->invalidate_page?
> > 
> > I'm not aware of how i915 works and what its expectations are wrt shmem.
> > Do you have some userspace VMA which is mirrored on the GPU side?
> > If yes, migration would cause unmapping of these pages and trigger the
> > mmu_notifier's hook.
> 
> We do that for userptr pages (i.e. stuff we steal from userspace address
> spaces). But we also have native gfx buffer objects based on shmem files,
> and thus far we need to allocate them as !__GFP_MOVABLE. And we allocate a
> _lot_ of those. And those files aren't mapped into any cpu address space
> (ofc they're mapped on the gpu side, but that's driver private), from the
> core mm they are pure pagecache. And afaiui for that we need to wire up
> the migratepage hooks through shmem to i915_gem.c

I see.

I don't particularly like the way the patch hooks into migrate, but I
don't have a good idea how to implement this better.

This approach allows hooking up to any shmem file, which could be
abused by drivers later.

I wonder if it would be better for i915 to have its own in-kernel mount
with a variant of tmpfs which provides different mapping->a_ops? Or is
that overkill? I don't know.

Hugh?
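
(For reference, such a driver-private mount would start out roughly as
below; get_fs_type() and kern_mount() are existing kernel APIs, while
the i915_gemfs name and the idea of overriding mapping->a_ops for files
created on that mount are purely hypothetical here.)

#include <linux/err.h>
#include <linux/fs.h>

/* Hypothetical driver-private tmpfs mount; files created on it could
 * then be given their own mapping->a_ops by the driver. */
static struct vfsmount *i915_gemfs;

static int i915_gemfs_init(void)
{
	struct file_system_type *type = get_fs_type("tmpfs");

	if (!type)
		return -ENODEV;

	i915_gemfs = kern_mount(type);
	return PTR_ERR_OR_ZERO(i915_gemfs);
}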
Hugh Dickins April 27, 2016, 7:16 a.m. UTC | #6
On Wed, 27 Apr 2016, Kirill A. Shutemov wrote:
> On Tue, Apr 26, 2016 at 02:53:41PM +0200, Daniel Vetter wrote:
> > On Mon, Apr 25, 2016 at 02:42:50AM +0300, Kirill A. Shutemov wrote:
> > > On Mon, Apr 04, 2016 at 02:18:10PM +0100, Chris Wilson wrote:
> > > > From: Akash Goel <akash.goel@intel.com>
> > > > 
> > > > This provides support for drivers or shmem file owners to register
> > > > a set of callbacks, which can be invoked from the address space
> > > > operations implemented by shmem.  This allows the file owners to
> > > > hook into the shmem address space operations to do some extra/custom
> > > > operations in addition to the default ones.
> > > > 
> > > > The private_data field of the address_space struct is used to store
> > > > the pointer to the driver-specific ops.  Currently only one ops field
> > > > is defined, migratepage, but it can be extended on an as-needed basis.
> > > > 
> > > > The need for driver-specific operations arises because some operations
> > > > (like migratepage) cannot be handled effectively within shmem alone
> > > > and need some driver-specific handling as well.
> > > > Specifically, i915.ko would like to participate in migratepage().
> > > > i915.ko uses shmemfs to provide swappable backing storage for its user
> > > > objects, but when those objects are in use by the GPU it must pin the
> > > > entire object until the GPU is idle.  As a result, large chunks of memory
> > > > can be arbitrarily withdrawn from page migration, resulting in premature
> > > > out-of-memory due to fragmentation.  However, if i915.ko can receive the
> > > > migratepage() request, it can then flush the object from the GPU, remove
> > > > its pin and thus enable the migration.
> > > > 
> > > > Since gfx allocations are one of the major consumers of system memory,
> > > > it's imperative to have such a mechanism to deal effectively with
> > > > fragmentation; hence the provision for initiating driver-specific
> > > > actions during address space operations.
> > > 
> > > Hm. Sorry, my ignorance, but shouldn't this kind of flushing be done in
> > > response to mmu_notifier's ->invalidate_page?
> > > 
> > > I'm not aware of how i915 works and what its expectations are wrt shmem.
> > > Do you have some userspace VMA which is mirrored on the GPU side?
> > > If yes, migration would cause unmapping of these pages and trigger the
> > > mmu_notifier's hook.
> > 
> > We do that for userptr pages (i.e. stuff we steal from userspace address
> > spaces). But we also have native gfx buffer objects based on shmem files,
> > and thus far we need to allocate them as !__GFP_MOVABLE. And we allocate a
> > _lot_ of those. And those files aren't mapped into any cpu address space
> > (ofc they're mapped on the gpu side, but that's driver private), from the
> > core mm they are pure pagecache. And afaiui for that we need to wire up
> > the migratepage hooks through shmem to i915_gem.c
> 
> I see.
> 
> I don't particularly like the way the patch hooks into migrate, but I
> don't have a good idea how to implement this better.
> 
> This approach allows hooking up to any shmem file, which could be
> abused by drivers later.
> 
> I wonder if it would be better for i915 to have its own in-kernel mount
> with a variant of tmpfs which provides different mapping->a_ops? Or is
> that overkill? I don't know.
> 
> Hugh?

This, and the 2/2, remain perpetually in my "needs more thought" box.
And won't get that thought today either, I'm afraid.  Tomorrow.

Like you, I don't particularly like these; but recognize that the i915
guys are doing all the rest of us a big favour by going to some trouble
to allow migration of their pinned pages.

Potential for abuse of migratepage by drivers is already there anyway:
we can be grateful that they're offering to use rather than abuse it;
but yes, it's a worry that such trickiness gets dispersed into drivers.

Amusing to see them grabbing shmem's page_private(), isn't it?

Hugh
Daniel Vetter April 27, 2016, 7:38 a.m. UTC | #7
On Wed, Apr 27, 2016 at 12:16:37AM -0700, Hugh Dickins wrote:
> On Wed, 27 Apr 2016, Kirill A. Shutemov wrote:
> > On Tue, Apr 26, 2016 at 02:53:41PM +0200, Daniel Vetter wrote:
> > > On Mon, Apr 25, 2016 at 02:42:50AM +0300, Kirill A. Shutemov wrote:
> > > > On Mon, Apr 04, 2016 at 02:18:10PM +0100, Chris Wilson wrote:
> > > > > From: Akash Goel <akash.goel@intel.com>
> > > > > 
> > > > > This provides support for drivers or shmem file owners to register
> > > > > a set of callbacks, which can be invoked from the address space
> > > > > operations implemented by shmem.  This allows the file owners to
> > > > > hook into the shmem address space operations to do some extra/custom
> > > > > operations in addition to the default ones.
> > > > > 
> > > > > The private_data field of the address_space struct is used to store
> > > > > the pointer to the driver-specific ops.  Currently only one ops field
> > > > > is defined, migratepage, but it can be extended on an as-needed basis.
> > > > > 
> > > > > The need for driver-specific operations arises because some operations
> > > > > (like migratepage) cannot be handled effectively within shmem alone
> > > > > and need some driver-specific handling as well.
> > > > > Specifically, i915.ko would like to participate in migratepage().
> > > > > i915.ko uses shmemfs to provide swappable backing storage for its user
> > > > > objects, but when those objects are in use by the GPU it must pin the
> > > > > entire object until the GPU is idle.  As a result, large chunks of memory
> > > > > can be arbitrarily withdrawn from page migration, resulting in premature
> > > > > out-of-memory due to fragmentation.  However, if i915.ko can receive the
> > > > > migratepage() request, it can then flush the object from the GPU, remove
> > > > > its pin and thus enable the migration.
> > > > > 
> > > > > Since gfx allocations are one of the major consumers of system memory,
> > > > > it's imperative to have such a mechanism to deal effectively with
> > > > > fragmentation; hence the provision for initiating driver-specific
> > > > > actions during address space operations.
> > > > 
> > > > Hm. Sorry, my ignorance, but shouldn't this kind of flushing be done in
> > > > response to mmu_notifier's ->invalidate_page?
> > > > 
> > > > I'm not aware of how i915 works and what its expectations are wrt shmem.
> > > > Do you have some userspace VMA which is mirrored on the GPU side?
> > > > If yes, migration would cause unmapping of these pages and trigger the
> > > > mmu_notifier's hook.
> > > 
> > > We do that for userptr pages (i.e. stuff we steal from userspace address
> > > spaces). But we also have native gfx buffer objects based on shmem files,
> > > and thus far we need to allocate them as !__GFP_MOVABLE. And we allocate a
> > > _lot_ of those. And those files aren't mapped into any cpu address space
> > > (ofc they're mapped on the gpu side, but that's driver private), from the
> > > core mm they are pure pagecache. And afaiui for that we need to wire up
> > > the migratepage hooks through shmem to i915_gem.c
> > 
> > I see.
> > 
> > I don't particularly like the way the patch hooks into migrate, but I
> > don't have a good idea how to implement this better.
> > 
> > This approach allows hooking up to any shmem file, which could be
> > abused by drivers later.
> > 
> > I wonder if it would be better for i915 to have its own in-kernel mount
> > with a variant of tmpfs which provides different mapping->a_ops? Or is
> > that overkill? I don't know.
> > 
> > Hugh?
> 
> This, and the 2/2, remain perpetually in my "needs more thought" box.
> And won't get that thought today either, I'm afraid.  Tomorrow.
> 
> Like you, I don't particularly like these; but recognize that the i915
> guys are doing all the rest of us a big favour by going to some trouble
> to allow migration of their pinned pages.
> 
> Potential for abuse of migratepage by drivers is already there anyway:
> we can be grateful that they're offering to use rather than abuse it;
> but yes, it's a worry that such trickiness gets dispersed into drivers.

Looking at our internal roadmap it'll likely get a lot worse, and in a few
years you'll have i915 asking the core mm politely to move around pages
for it because they're placed suboptimally for gpu access. It'll be fun.

We don't have a prototype at all yet, even internally, but I think that's
another reason why a more cozy relationship between i915 and shmem would
be good. Not sure you want that, or whether we should resurrect the old
idea of a gemfs.
-Daniel

Patch

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 4d4780c00d34..d7925b66c240 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -34,11 +34,28 @@ struct shmem_sb_info {
 	struct mempolicy *mpol;     /* default memory policy for mappings */
 };
 
+struct shmem_dev_info {
+	void *dev_private_data;
+	int (*dev_migratepage)(struct address_space *mapping,
+			       struct page *newpage, struct page *page,
+			       enum migrate_mode mode, void *dev_priv_data);
+};
+
 static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
 {
 	return container_of(inode, struct shmem_inode_info, vfs_inode);
 }
 
+static inline int shmem_set_device_ops(struct address_space *mapping,
+				       struct shmem_dev_info *info)
+{
+	if (mapping->private_data != NULL)
+		return -EEXIST;
+
+	mapping->private_data = info;
+	return 0;
+}
+
 /*
  * Functions in mm/shmem.c called directly from elsewhere:
  */
diff --git a/mm/shmem.c b/mm/shmem.c
index 9428c51ab2d6..6ed953193883 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -947,6 +947,21 @@ redirty:
 	return 0;
 }
 
+#ifdef CONFIG_MIGRATION
+static int shmem_migratepage(struct address_space *mapping,
+			     struct page *newpage, struct page *page,
+			     enum migrate_mode mode)
+{
+	struct shmem_dev_info *dev_info = mapping->private_data;
+
+	if (dev_info && dev_info->dev_migratepage)
+		return dev_info->dev_migratepage(mapping, newpage, page,
+				mode, dev_info->dev_private_data);
+
+	return migrate_page(mapping, newpage, page, mode);
+}
+#endif
+
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_TMPFS
 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -3161,7 +3176,7 @@ static const struct address_space_operations shmem_aops = {
 	.write_end	= shmem_write_end,
 #endif
 #ifdef CONFIG_MIGRATION
-	.migratepage	= migrate_page,
+	.migratepage	= shmem_migratepage,
 #endif
 	.error_remove_page = generic_error_remove_page,
 };
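
For completeness, a rough sketch of what a registered callback might
look like on the driver side (the actual i915 hook-up is in patch 2/2,
not shown here).  Only the callback signature and the migrate_page()
fallback come from this patch; all i915-side names below are
assumptions.

static int i915_gem_shmem_migratepage(struct address_space *mapping,
				      struct page *newpage, struct page *page,
				      enum migrate_mode mode,
				      void *dev_priv_data)
{
	struct drm_i915_private *i915 = dev_priv_data;
	int ret;

	/* Don't block on the GPU in async mode (hypothetical check). */
	if (mode == MIGRATE_ASYNC && i915_gem_page_is_active(i915, page))
		return -EBUSY;

	/* Flush the object from the GPU and drop its pin so the page
	 * becomes movable (hypothetical helper). */
	ret = i915_gem_unpin_page_for_migration(i915, page);
	if (ret)
		return ret;

	/* The core migration code does the actual copy. */
	return migrate_page(mapping, newpage, page, mode);
}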