[05/10] dax: Allow DAX code to replace exceptional entries
diff mbox

Message ID 1458566575-28063-6-git-send-email-jack@suse.cz
State New
Headers show

Commit Message

Jan Kara March 21, 2016, 1:22 p.m. UTC
Currently we forbid page_cache_tree_insert() to replace exceptional radix
tree entries for DAX inodes. However to make DAX faults race free we will
lock radix tree entries and when hole is created, we need to replace
such locked radix tree entry with a hole page. So modify
page_cache_tree_insert() to allow that.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/dax.h |  6 ++++++
 mm/filemap.c        | 18 +++++++++++-------
 2 files changed, 17 insertions(+), 7 deletions(-)

Comments

Ross Zwisler March 23, 2016, 5:52 p.m. UTC | #1
On Mon, Mar 21, 2016 at 02:22:50PM +0100, Jan Kara wrote:
> Currently we forbid page_cache_tree_insert() to replace exceptional radix
> tree entries for DAX inodes. However to make DAX faults race free we will
> lock radix tree entries and when hole is created, we need to replace
> such locked radix tree entry with a hole page. So modify
> page_cache_tree_insert() to allow that.

Perhaps this is addressed later in the series, but at first glance this seems
unsafe to me - what happens if we had tasks waiting on the locked entry?  Are
they woken up when we replace the locked entry with a page cache entry?

If they are woken up, will they be alright with the fact that they were
sleeping on a lock for an exceptional entry, and now that lock no longer
exists?  The radix tree entry will now be filled with a zero page hole, and so
they can't acquire the lock they were sleeping for - instead if they wanted to
lock that page they'd need to lock the zero page page lock, right?

> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  include/linux/dax.h |  6 ++++++
>  mm/filemap.c        | 18 +++++++++++-------
>  2 files changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 7c45ac7ea1d1..4b63923e1f8d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -3,8 +3,14 @@
>  
>  #include <linux/fs.h>
>  #include <linux/mm.h>
> +#include <linux/radix-tree.h>
>  #include <asm/pgtable.h>
>  
> +/*
> + * Since exceptional entries do not use indirect bit, we reuse it as a lock bit
> + */
> +#define DAX_ENTRY_LOCK RADIX_TREE_INDIRECT_PTR
> +
>  ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
>  		  get_block_t, dio_iodone_t, int flags);
>  int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 7c00f105845e..fbebedaf719e 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -597,14 +597,18 @@ static int page_cache_tree_insert(struct address_space *mapping,
>  		if (!radix_tree_exceptional_entry(p))
>  			return -EEXIST;
>  
> -		if (WARN_ON(dax_mapping(mapping)))
> -			return -EINVAL;
> -
> -		if (shadowp)
> -			*shadowp = p;
>  		mapping->nrexceptional--;
> -		if (node)
> -			workingset_node_shadows_dec(node);
> +		if (!dax_mapping(mapping)) {
> +			if (shadowp)
> +				*shadowp = p;
> +			if (node)
> +				workingset_node_shadows_dec(node);
> +		} else {
> +			/* DAX can replace empty locked entry with a hole */
> +			WARN_ON_ONCE(p !=
> +				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
> +					 DAX_ENTRY_LOCK));
> +		}
>  	}
>  	radix_tree_replace_slot(slot, page);
>  	mapping->nrpages++;
> -- 
> 2.6.2
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara March 24, 2016, 10:42 a.m. UTC | #2
On Wed 23-03-16 11:52:58, Ross Zwisler wrote:
> On Mon, Mar 21, 2016 at 02:22:50PM +0100, Jan Kara wrote:
> > Currently we forbid page_cache_tree_insert() to replace exceptional radix
> > tree entries for DAX inodes. However to make DAX faults race free we will
> > lock radix tree entries and when hole is created, we need to replace
> > such locked radix tree entry with a hole page. So modify
> > page_cache_tree_insert() to allow that.
> 
> Perhaps this is addressed later in the series, but at first glance this seems
> unsafe to me - what happens if we had tasks waiting on the locked entry?  Are
> they woken up when we replace the locked entry with a page cache entry?
> 
> If they are woken up, will they be alright with the fact that they were
> sleeping on a lock for an exceptional entry, and now that lock no longer
> exists?  The radix tree entry will now be filled with a zero page hole, and so
> they can't acquire the lock they were sleeping for - instead if they wanted to
> lock that page they'd need to lock the zero page page lock, right?

Correct. What you describe can happen and the DAX entry locking code
accounts for that. I guess I could comment on that in the changelog.

								Honza
> 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  include/linux/dax.h |  6 ++++++
> >  mm/filemap.c        | 18 +++++++++++-------
> >  2 files changed, 17 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/linux/dax.h b/include/linux/dax.h
> > index 7c45ac7ea1d1..4b63923e1f8d 100644
> > --- a/include/linux/dax.h
> > +++ b/include/linux/dax.h
> > @@ -3,8 +3,14 @@
> >  
> >  #include <linux/fs.h>
> >  #include <linux/mm.h>
> > +#include <linux/radix-tree.h>
> >  #include <asm/pgtable.h>
> >  
> > +/*
> > + * Since exceptional entries do not use indirect bit, we reuse it as a lock bit
> > + */
> > +#define DAX_ENTRY_LOCK RADIX_TREE_INDIRECT_PTR
> > +
> >  ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
> >  		  get_block_t, dio_iodone_t, int flags);
> >  int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 7c00f105845e..fbebedaf719e 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -597,14 +597,18 @@ static int page_cache_tree_insert(struct address_space *mapping,
> >  		if (!radix_tree_exceptional_entry(p))
> >  			return -EEXIST;
> >  
> > -		if (WARN_ON(dax_mapping(mapping)))
> > -			return -EINVAL;
> > -
> > -		if (shadowp)
> > -			*shadowp = p;
> >  		mapping->nrexceptional--;
> > -		if (node)
> > -			workingset_node_shadows_dec(node);
> > +		if (!dax_mapping(mapping)) {
> > +			if (shadowp)
> > +				*shadowp = p;
> > +			if (node)
> > +				workingset_node_shadows_dec(node);
> > +		} else {
> > +			/* DAX can replace empty locked entry with a hole */
> > +			WARN_ON_ONCE(p !=
> > +				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
> > +					 DAX_ENTRY_LOCK));
> > +		}
> >  	}
> >  	radix_tree_replace_slot(slot, page);
> >  	mapping->nrpages++;
> > -- 
> > 2.6.2
> >

Patch
diff mbox

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 7c45ac7ea1d1..4b63923e1f8d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -3,8 +3,14 @@ 
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+/*
+ * Since exceptional entries do not use indirect bit, we reuse it as a lock bit
+ */
+#define DAX_ENTRY_LOCK RADIX_TREE_INDIRECT_PTR
+
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 		  get_block_t, dio_iodone_t, int flags);
 int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
diff --git a/mm/filemap.c b/mm/filemap.c
index 7c00f105845e..fbebedaf719e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -597,14 +597,18 @@  static int page_cache_tree_insert(struct address_space *mapping,
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
 
-		if (WARN_ON(dax_mapping(mapping)))
-			return -EINVAL;
-
-		if (shadowp)
-			*shadowp = p;
 		mapping->nrexceptional--;
-		if (node)
-			workingset_node_shadows_dec(node);
+		if (!dax_mapping(mapping)) {
+			if (shadowp)
+				*shadowp = p;
+			if (node)
+				workingset_node_shadows_dec(node);
+		} else {
+			/* DAX can replace empty locked entry with a hole */
+			WARN_ON_ONCE(p !=
+				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+					 DAX_ENTRY_LOCK));
+		}
 	}
 	radix_tree_replace_slot(slot, page);
 	mapping->nrpages++;