
[v4,10/12] dax: add struct iomap based DAX PMD support

Message ID 1475189370-31634-11-git-send-email-ross.zwisler@linux.intel.com (mailing list archive)
State Not Applicable

Commit Message

Ross Zwisler Sept. 29, 2016, 10:49 p.m. UTC
DAX PMDs have been disabled since Jan Kara introduced DAX radix tree based
locking.  This patch allows DAX PMDs to participate in the DAX radix tree
based locking scheme so that they can be re-enabled using the new struct
iomap based fault handlers.

There are currently three types of DAX 4k entries: 4k zero pages, 4k DAX
mappings that have an associated block allocation, and 4k DAX empty
entries.  The empty entries exist to provide locking for the duration of a
given page fault.

This patch adds three equivalent 2MiB DAX entries: Huge Zero Page (HZP)
entries, PMD DAX entries that have associated block allocations, and 2 MiB
DAX empty entries.

Unlike the 4k case where we insert a struct page* into the radix tree for
4k zero pages, for HZP we insert a DAX exceptional entry with the new
RADIX_DAX_HZP flag set.  This is because we use a single 2 MiB zero page in
every 2MiB hole mapping, and it doesn't make sense to have that same struct
page* with multiple entries in multiple trees.  This would cause contention
on the single page lock for the one Huge Zero Page, and it would break the
page->index and page->mapping associations that are assumed to be valid in
many other places in the kernel.

One difficult use case is when one thread is trying to use 4k entries in
the radix tree for a given offset, and another thread is using 2 MiB entries
for that same offset.  The current code handles this by making the 2 MiB
user fall back to 4k entries for most cases.  This was done because it is
the simplest solution, and because the use of 2MiB pages is already
opportunistic.

If we were to try to upgrade from 4k pages to 2MiB pages for a given range,
we run into the problem of how we lock out 4k page faults for the entire
2MiB range while we clean out the radix tree so we can insert the 2MiB
entry.  We can solve this problem if we need to, but I think that the cases
where both 2MiB entries and 4K entries are being used for the same range
will be rare enough and the gain small enough that it probably won't be
worth the complexity.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c            | 334 +++++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/dax.h |  38 +++++-
 mm/filemap.c        |   4 +-
 3 files changed, 327 insertions(+), 49 deletions(-)
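
As background for the review comments below, here is a small standalone
sketch (userspace C, not kernel code) of the entry encoding this patch moves
into include/linux/dax.h.  The RADIX_TREE_EXCEPTIONAL_* values are assumptions
matching the radix tree's exceptional-entry convention; the RADIX_DAX_*
definitions mirror the ones added by the patch.

#include <stdio.h>

#define RADIX_TREE_EXCEPTIONAL_ENTRY	2
#define RADIX_TREE_EXCEPTIONAL_SHIFT	2

#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 5)
#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PTE		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_HZP		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 4))
#define RADIX_DAX_TYPE_MASK	(RADIX_DAX_PTE | RADIX_DAX_PMD)

int main(void)
{
	unsigned long sector = 0x1234;

	/* a locked PMD entry with real storage, as RADIX_DAX_ENTRY() builds it */
	unsigned long entry = RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_PMD |
			(sector << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK;

	printf("is PMD:  %d\n", (entry & RADIX_DAX_TYPE_MASK) == RADIX_DAX_PMD);
	printf("sector:  0x%lx\n", entry >> RADIX_DAX_SHIFT);
	printf("locked:  %d\n", !!(entry & RADIX_DAX_ENTRY_LOCK));
	return 0;
}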

Comments

Christoph Hellwig Sept. 30, 2016, 9:56 a.m. UTC | #1
> -/*
> - * We use lowest available bit in exceptional entry for locking, other two
> - * bits to determine entry type. In total 3 special bits.
> - */
> -#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
> -#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
> -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
> -#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
> -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
> -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
> -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
> -		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
> -		RADIX_TREE_EXCEPTIONAL_ENTRY))
> -

Please split the move of these constants into a separate patch.

> -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
> +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
> +		unsigned long new_type)
>  {
> +	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
>  	void *entry, **slot;
>  
>  restart:
>  	spin_lock_irq(&mapping->tree_lock);
>  	entry = get_unlocked_mapping_entry(mapping, index, &slot);
> +
> +	if (entry && new_type == RADIX_DAX_PMD) {
> +		if (!radix_tree_exceptional_entry(entry) ||
> +				RADIX_DAX_TYPE(entry) == RADIX_DAX_PTE) {
> +			spin_unlock_irq(&mapping->tree_lock);
> +			return ERR_PTR(-EEXIST);
> +		}
> +	} else if (entry && new_type == RADIX_DAX_PTE) {
> +		if (radix_tree_exceptional_entry(entry) &&
> +		    RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD &&
> +		    (unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
> +			pmd_downgrade = true;
> +		}
> +	}

	Would be nice to use switch on the type here:

	old_type = RADIX_DAX_TYPE(entry);

	if (entry) {
		switch (new_type) {
		case RADIX_DAX_PMD:
			if (!radix_tree_exceptional_entry(entry) ||
			    old_type == RADIX_DAX_PTE) {
			    	entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
			break;
		case RADIX_DAX_PTE:
			if (radix_tree_exceptional_entry(entry) &&
			    old_type == RADIX_DAX_PMD &&
			    (unsigned long)entry & 
			      (RADIX_DAX_HZP|RADIX_DAX_EMPTY))
			      	..

Btw, why are only RADIX_DAX_PTE and RADIX_DAX_PMD in the type mask,
and not RADIX_DAX_HZP and RADIX_DAX_EMPTY?  With that we could use the
above old_type local variable throughout this function and make it a lot
less of a mess.

>  static void *dax_insert_mapping_entry(struct address_space *mapping,
>  				      struct vm_fault *vmf,
> -				      void *entry, sector_t sector)
> +				      void *entry, sector_t sector,
> +				      unsigned long new_type, bool hzp)

And then we could also drop the hzp argument here..

>  #ifdef CONFIG_FS_IOMAP
> +static inline sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
> +{
> +	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
> +}

Please split adding this new helper into a separate patch.

> +#if defined(CONFIG_FS_DAX_PMD)

Please use #ifdef here.

> +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
> +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
> +
> +/* entries begin locked */
> +#define RADIX_DAX_ENTRY(sector, type) ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |\
> +	type | (unsigned long)sector << RADIX_DAX_SHIFT | RADIX_DAX_ENTRY_LOCK))
> +#define RADIX_DAX_HZP_ENTRY() ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | \
> +	RADIX_DAX_PMD | RADIX_DAX_HZP | RADIX_DAX_EMPTY | RADIX_DAX_ENTRY_LOCK))
> +#define RADIX_DAX_EMPTY_ENTRY(type) ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | \
> +		type | RADIX_DAX_EMPTY | RADIX_DAX_ENTRY_LOCK))
> +
> +#define RADIX_DAX_ORDER(type) (type == RADIX_DAX_PMD ? PMD_SHIFT-PAGE_SHIFT : 0)

All these macros don't properly brace their arguments.  I think
you'd make your life a lot easier by making them inline functions.
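
For illustration, a minimal sketch of what the inline-function form could
look like (hypothetical names; unlike the macros, the arguments are evaluated
normally and need no extra bracing):

static inline unsigned long dax_radix_type(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_TYPE_MASK;
}

static inline sector_t dax_radix_sector(void *entry)
{
	return (unsigned long)entry >> RADIX_DAX_SHIFT;
}

static inline unsigned int dax_radix_order(unsigned long type)
{
	return (type == RADIX_DAX_PMD) ? PMD_SHIFT - PAGE_SHIFT : 0;
}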

> +#if defined(CONFIG_FS_DAX_PMD)

#ifdef, please
Jan Kara Oct. 3, 2016, 10:59 a.m. UTC | #2
On Thu 29-09-16 16:49:28, Ross Zwisler wrote:
> DAX PMDs have been disabled since Jan Kara introduced DAX radix tree based
> locking.  This patch allows DAX PMDs to participate in the DAX radix tree
> based locking scheme so that they can be re-enabled using the new struct
> iomap based fault handlers.
> 
> There are currently three types of DAX 4k entries: 4k zero pages, 4k DAX
> mappings that have an associated block allocation, and 4k DAX empty
> entries.  The empty entries exist to provide locking for the duration of a
> given page fault.
> 
> This patch adds three equivalent 2MiB DAX entries: Huge Zero Page (HZP)
> entries, PMD DAX entries that have associated block allocations, and 2 MiB
> DAX empty entries.
> 
> Unlike the 4k case where we insert a struct page* into the radix tree for
> 4k zero pages, for HZP we insert a DAX exceptional entry with the new
> RADIX_DAX_HZP flag set.  This is because we use a single 2 MiB zero page in
> every 2MiB hole mapping, and it doesn't make sense to have that same struct
> page* with multiple entries in multiple trees.  This would cause contention
> on the single page lock for the one Huge Zero Page, and it would break the
> page->index and page->mapping associations that are assumed to be valid in
> many other places in the kernel.
> 
> One difficult use case is when one thread is trying to use 4k entries in
> the radix tree for a given offset, and another thread is using 2 MiB entries
> for that same offset.  The current code handles this by making the 2 MiB
> user fall back to 4k entries for most cases.  This was done because it is
> the simplest solution, and because the use of 2MiB pages is already
> opportunistic.
> 
> If we were to try to upgrade from 4k pages to 2MiB pages for a given range,
> we run into the problem of how we lock out 4k page faults for the entire
> 2MiB range while we clean out the radix tree so we can insert the 2MiB
> entry.  We can solve this problem if we need to, but I think that the cases
> where both 2MiB entries and 4K entries are being used for the same range
> will be rare enough and the gain small enough that it probably won't be
> worth the complexity.
...
>  restart:
>  	spin_lock_irq(&mapping->tree_lock);
>  	entry = get_unlocked_mapping_entry(mapping, index, &slot);
> +
> +	if (entry && new_type == RADIX_DAX_PMD) {
> +		if (!radix_tree_exceptional_entry(entry) ||
> +				RADIX_DAX_TYPE(entry) == RADIX_DAX_PTE) {
> +			spin_unlock_irq(&mapping->tree_lock);
> +			return ERR_PTR(-EEXIST);
> +		}
> +	} else if (entry && new_type == RADIX_DAX_PTE) {
> +		if (radix_tree_exceptional_entry(entry) &&
> +		    RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD &&
> +		    (unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
> +			pmd_downgrade = true;
> +		}
> +	}
> +
>  	/* No entry for given index? Make sure radix tree is big enough. */
> -	if (!entry) {
> +	if (!entry || pmd_downgrade) {
>  		int err;
>  
>  		spin_unlock_irq(&mapping->tree_lock);
> @@ -420,15 +439,39 @@ restart:
>  				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
>  		if (err)
>  			return ERR_PTR(err);
> -		entry = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
> -			       RADIX_DAX_ENTRY_LOCK);
> +
> +		/*
> +		 * Besides huge zero pages the only other thing that gets
> +		 * downgraded are empty entries which don't need to be
> +		 * unmapped.
> +		 */
> +		if (pmd_downgrade && ((unsigned long)entry & RADIX_DAX_HZP))
> +			unmap_mapping_range(mapping,
> +				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
> +
>  		spin_lock_irq(&mapping->tree_lock);
> -		err = radix_tree_insert(&mapping->page_tree, index, entry);
> +
> +		if (pmd_downgrade) {
> +			radix_tree_delete(&mapping->page_tree, index);
> +			mapping->nrexceptional--;
> +			dax_wake_mapping_entry_waiter(entry, mapping, index,
> +					false);
> +		}

Hum, this looks really problematic. Once you have dropped tree_lock,
anything could have happened with the radix tree - in particular the entry
you've got from get_unlocked_mapping_entry() can be different by now. Also
there's no guarantee that someone does not map the huge entry again just
after your call to unmap_mapping_range() finished.

So it seems you need to lock the entry (if you have one) before releasing
tree_lock to stabilize it. That is enough also to block other mappings of
that entry. Then once you reacquire the tree_lock, you can safely delete it
and replace it with a different entry.
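
A rough sketch of that ordering (illustrative only, and eliding the
radix_tree_preload() handling; lock_slot() is the existing fs/dax.c helper
that marks the slot's entry locked while tree_lock is held):

	if (pmd_downgrade) {
		/* pin the 2MiB entry before dropping tree_lock */
		entry = lock_slot(mapping, slot);
	}
	spin_unlock_irq(&mapping->tree_lock);

	/* the locked entry now blocks anyone from mapping it again */
	if (pmd_downgrade && ((unsigned long)entry & RADIX_DAX_HZP))
		unmap_mapping_range(mapping,
			(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);

	spin_lock_irq(&mapping->tree_lock);
	if (pmd_downgrade) {
		/* safe now: we hold both the entry lock and tree_lock */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
		dax_wake_mapping_entry_waiter(entry, mapping, index, false);
	}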

> +
> +		entry = RADIX_DAX_EMPTY_ENTRY(new_type);
> +
> +		err = __radix_tree_insert(&mapping->page_tree, index,
> +				RADIX_DAX_ORDER(new_type), entry);
>  		radix_tree_preload_end();
>  		if (err) {
>  			spin_unlock_irq(&mapping->tree_lock);
> -			/* Someone already created the entry? */
> -			if (err == -EEXIST)
> +			/*
> +			 * Someone already created the entry?  This is a
> +			 * normal failure when inserting PMDs in a range
> +			 * that already contains PTEs.  In that case we want
> +			 * to return -EEXIST immediately.
> +			 */
> +			if (err == -EEXIST && new_type == RADIX_DAX_PTE)
>  				goto restart;
>  			return ERR_PTR(err);
>  		}
> @@ -596,11 +639,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size
>  	return 0;
>  }
>  
> -#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
> -
> +/*
> + * By this point grab_mapping_entry() has ensured that we have a locked entry
> + * of the appropriate size so we don't have to worry about downgrading PMDs to
> + * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
> + * already in the tree, we will skip the insertion and just dirty the PMD as
> + * appropriate.
> + */
>  static void *dax_insert_mapping_entry(struct address_space *mapping,
>  				      struct vm_fault *vmf,
> -				      void *entry, sector_t sector)
> +				      void *entry, sector_t sector,
> +				      unsigned long new_type, bool hzp)
>  {
>  	struct radix_tree_root *page_tree = &mapping->page_tree;
>  	int error = 0;
> @@ -623,22 +672,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
>  		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
>  		if (error)
>  			return ERR_PTR(error);
> +	} else if ((unsigned long)entry & RADIX_DAX_HZP && !hzp) {
> +		/* replacing huge zero page with PMD block mapping */
> +		unmap_mapping_range(mapping,
> +			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
>  	}
>  
>  	spin_lock_irq(&mapping->tree_lock);
> -	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
> -		       RADIX_DAX_ENTRY_LOCK);
> +	if (hzp)
> +		new_entry = RADIX_DAX_HZP_ENTRY();
> +	else
> +		new_entry = RADIX_DAX_ENTRY(sector, new_type);
> +
>  	if (hole_fill) {
>  		__delete_from_page_cache(entry, NULL);
>  		/* Drop pagecache reference */
>  		put_page(entry);
> -		error = radix_tree_insert(page_tree, index, new_entry);
> +		error = __radix_tree_insert(page_tree, index,
> +				RADIX_DAX_ORDER(new_type), new_entry);
>  		if (error) {
>  			new_entry = ERR_PTR(error);
>  			goto unlock;
>  		}
>  		mapping->nrexceptional++;
> -	} else {
> +	} else if ((unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
>  		void **slot;
>  		void *ret;

Hum, I somewhat dislike how PTE and PMD paths differ here. But it's OK for
now I guess. Long term we might be better off to do away with zero pages
for PTEs as well and use exceptional entry and a single zero page like you
do for PMD. Because the special cases these zero pages cause are a
headache.

>  
> @@ -694,6 +751,18 @@ static int dax_writeback_one(struct block_device *bdev,
>  		goto unlock;
>  	}
>  
> +	if (WARN_ON_ONCE((unsigned long)entry & RADIX_DAX_EMPTY)) {
> +		ret = -EIO;
> +		goto unlock;
> +	}
> +
> +	/*
> +	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
> +	 * in the middle of a PMD, the 'index' we are given will be aligned to
> +	 * the start index of the PMD, as will the sector we pull from
> +	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
> +	 * worry about partial PMD writebacks.
> +	 */
>  	dax.sector = RADIX_DAX_SECTOR(entry);
>  	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
>  	spin_unlock_irq(&mapping->tree_lock);

<snip>

> +int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> +		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
> +{
> +	struct address_space *mapping = vma->vm_file->f_mapping;
> +	unsigned long pmd_addr = address & PMD_MASK;
> +	bool write = flags & FAULT_FLAG_WRITE;
> +	struct inode *inode = mapping->host;
> +	struct iomap iomap = { 0 };
> +	int error, result = 0;
> +	pgoff_t size, pgoff;
> +	struct vm_fault vmf;
> +	void *entry;
> +	loff_t pos;
> +
> +	/* Fall back to PTEs if we're going to COW */
> +	if (write && !(vma->vm_flags & VM_SHARED)) {
> +		split_huge_pmd(vma, pmd, address);
> +		return VM_FAULT_FALLBACK;
> +	}
> +
> +	/* If the PMD would extend outside the VMA */
> +	if (pmd_addr < vma->vm_start)
> +		return VM_FAULT_FALLBACK;
> +	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
> +		return VM_FAULT_FALLBACK;
> +
> +	/*
> +	 * Check whether offset isn't beyond end of file now. Caller is
> +	 * supposed to hold locks serializing us with truncate / punch hole so
> +	 * this is a reliable test.
> +	 */
> +	pgoff = linear_page_index(vma, pmd_addr);
> +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +
> +	if (pgoff >= size)
> +		return VM_FAULT_SIGBUS;
> +
> +	/* If the PMD would extend beyond the file size */
> +	if ((pgoff | PG_PMD_COLOUR) >= size)
> +		return VM_FAULT_FALLBACK;
> +
> +	/*
> +	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
> +	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
> +	 * the tree, for instance), it will return -EEXIST and we just fall
> +	 * back to 4k entries.
> +	 */
> +	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
> +	if (IS_ERR(entry))
> +		return VM_FAULT_FALLBACK;
> +
> +	/*
> +	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
> +	 * setting up a mapping, so really we're using iomap_begin() as a way
> +	 * to look up our filesystem block.
> +	 */
> +	pos = (loff_t)pgoff << PAGE_SHIFT;
> +	error = ops->iomap_begin(inode, pos, PMD_SIZE, write ? IOMAP_WRITE : 0,
> +			&iomap);

I'm not quite sure if it is OK to call ->iomap_begin() without ever calling
->iomap_end. Specifically the comment before iomap_apply() says:

"It is assumed that the filesystems will lock whatever resources they
require in the iomap_begin call, and release them in the iomap_end call."

so what you do could result in unbalanced allocations / locks / whatever.
Christoph?

> +	if (error)
> +		goto fallback;
> +	if (iomap.offset + iomap.length < pos + PMD_SIZE)
> +		goto fallback;
> +
> +	vmf.pgoff = pgoff;
> +	vmf.flags = flags;
> +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS | __GFP_IO;

I don't think you want __GFP_FS here - we have already gone through the
filesystem's pmd_fault() handler which called dax_iomap_pmd_fault() and
thus we hold various fs locks, freeze protection, ...

> +
> +	switch (iomap.type) {
> +	case IOMAP_MAPPED:
> +		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
> +				&iomap, pos, write, &entry);
> +		break;
> +	case IOMAP_UNWRITTEN:
> +	case IOMAP_HOLE:
> +		if (WARN_ON_ONCE(write))
> +			goto fallback;
> +		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
> +				&entry);
> +		break;
> +	default:
> +		WARN_ON_ONCE(1);
> +		result = VM_FAULT_FALLBACK;
> +		break;
> +	}
> +
> +	if (result == VM_FAULT_FALLBACK)
> +		count_vm_event(THP_FAULT_FALLBACK);
> +
> + unlock_entry:
> +	put_locked_mapping_entry(mapping, pgoff, entry);
> +	return result;
> +
> + fallback:
> +	count_vm_event(THP_FAULT_FALLBACK);
> +	result = VM_FAULT_FALLBACK;
> +	goto unlock_entry;
> +}
> +EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
> +#endif /* CONFIG_FS_DAX_PMD */
>  #endif /* CONFIG_FS_IOMAP */
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index c4a51bb..cacff9e 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -8,8 +8,33 @@
>  
>  struct iomap_ops;
>  
> -/* We use lowest available exceptional entry bit for locking */
> +/*
> + * We use lowest available bit in exceptional entry for locking, two bits for
> + * the entry type (PMD & PTE), and two more for flags (HZP and empty).  In
> + * total five special bits.
> + */
> +#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 5)
>  #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
> +/* PTE and PMD types */
> +#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
> +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
> +/* huge zero page and empty entry flags */
> +#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
> +#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 4))

I think we can do with just 2 bits for type instead of 4 but for now this
is OK I guess.

								Honza
Christoph Hellwig Oct. 3, 2016, 4:37 p.m. UTC | #3
On Mon, Oct 03, 2016 at 12:59:49PM +0200, Jan Kara wrote:
> I'm not quite sure if it is OK to call ->iomap_begin() without ever calling
> ->iomap_end. Specifically the comment before iomap_apply() says:
> 
> "It is assumed that the filesystems will lock whatever resources they
> require in the iomap_begin call, and release them in the iomap_end call."
> 
> so what you do could result in unbalanced allocations / locks / whatever.
> Christoph?

Indeed.  For XFS we only rely on iomap_end for error handling at the
moment, but it is intended to be paired for locking, as cluster file
systems like gfs2 requested this.
Ross Zwisler Oct. 3, 2016, 9:05 p.m. UTC | #4
On Mon, Oct 03, 2016 at 12:59:49PM +0200, Jan Kara wrote:
> On Thu 29-09-16 16:49:28, Ross Zwisler wrote:
> > @@ -420,15 +439,39 @@ restart:
> >  				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
> >  		if (err)
> >  			return ERR_PTR(err);
> > -		entry = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
> > -			       RADIX_DAX_ENTRY_LOCK);
> > +
> > +		/*
> > +		 * Besides huge zero pages the only other thing that gets
> > +		 * downgraded are empty entries which don't need to be
> > +		 * unmapped.
> > +		 */
> > +		if (pmd_downgrade && ((unsigned long)entry & RADIX_DAX_HZP))
> > +			unmap_mapping_range(mapping,
> > +				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
> > +
> >  		spin_lock_irq(&mapping->tree_lock);
> > -		err = radix_tree_insert(&mapping->page_tree, index, entry);
> > +
> > +		if (pmd_downgrade) {
> > +			radix_tree_delete(&mapping->page_tree, index);
> > +			mapping->nrexceptional--;
> > +			dax_wake_mapping_entry_waiter(entry, mapping, index,
> > +					false);
> > +		}
> 
> Hum, this looks really problematic. Once you have dropped tree_lock,
> anything could have happened with the radix tree - in particular the entry
> you've got from get_unlocked_mapping_entry() can be different by now. Also
> there's no guarantee that someone does not map the huge entry again just
> after your call to unmap_mapping_range() finished.
> 
> So it seems you need to lock the entry (if you have one) before releasing
> tree_lock to stabilize it. That is enough also to block other mappings of
> that entry. Then once you reacquire the tree_lock, you can safely delete it
> and replace it with a different entry.

Yep, great catch.  I'll lock the entry before I drop tree_lock.

> > @@ -623,22 +672,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
> >  		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
> >  		if (error)
> >  			return ERR_PTR(error);
> > +	} else if ((unsigned long)entry & RADIX_DAX_HZP && !hzp) {
> > +		/* replacing huge zero page with PMD block mapping */
> > +		unmap_mapping_range(mapping,
> > +			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
> >  	}
> >  
> >  	spin_lock_irq(&mapping->tree_lock);
> > -	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
> > -		       RADIX_DAX_ENTRY_LOCK);
> > +	if (hzp)
> > +		new_entry = RADIX_DAX_HZP_ENTRY();
> > +	else
> > +		new_entry = RADIX_DAX_ENTRY(sector, new_type);
> > +
> >  	if (hole_fill) {
> >  		__delete_from_page_cache(entry, NULL);
> >  		/* Drop pagecache reference */
> >  		put_page(entry);
> > -		error = radix_tree_insert(page_tree, index, new_entry);
> > +		error = __radix_tree_insert(page_tree, index,
> > +				RADIX_DAX_ORDER(new_type), new_entry);
> >  		if (error) {
> >  			new_entry = ERR_PTR(error);
> >  			goto unlock;
> >  		}
> >  		mapping->nrexceptional++;
> > -	} else {
> > +	} else if ((unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
> >  		void **slot;
> >  		void *ret;
> 
> Hum, I somewhat dislike how PTE and PMD paths differ here. But it's OK for
> now I guess. Long term we might be better off to do away with zero pages
> for PTEs as well and use exceptional entry and a single zero page like you
> do for PMD. Because the special cases these zero pages cause are a
> headache.

I've been thinking about this as well, and I do think we'd be better off with
a single zero page for PTEs, as we have with PMDs.  It'd reduce the special
casing in the DAX code, and it'd also ensure that we don't waste a bunch of
time and memory creating read-only zero pages to service reads from holes.

I'll look into adding this for v5.

> > +int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> > +		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
> > +{
> > +	struct address_space *mapping = vma->vm_file->f_mapping;
> > +	unsigned long pmd_addr = address & PMD_MASK;
> > +	bool write = flags & FAULT_FLAG_WRITE;
> > +	struct inode *inode = mapping->host;
> > +	struct iomap iomap = { 0 };
> > +	int error, result = 0;
> > +	pgoff_t size, pgoff;
> > +	struct vm_fault vmf;
> > +	void *entry;
> > +	loff_t pos;
> > +
> > +	/* Fall back to PTEs if we're going to COW */
> > +	if (write && !(vma->vm_flags & VM_SHARED)) {
> > +		split_huge_pmd(vma, pmd, address);
> > +		return VM_FAULT_FALLBACK;
> > +	}
> > +
> > +	/* If the PMD would extend outside the VMA */
> > +	if (pmd_addr < vma->vm_start)
> > +		return VM_FAULT_FALLBACK;
> > +	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
> > +		return VM_FAULT_FALLBACK;
> > +
> > +	/*
> > +	 * Check whether offset isn't beyond end of file now. Caller is
> > +	 * supposed to hold locks serializing us with truncate / punch hole so
> > +	 * this is a reliable test.
> > +	 */
> > +	pgoff = linear_page_index(vma, pmd_addr);
> > +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> > +
> > +	if (pgoff >= size)
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	/* If the PMD would extend beyond the file size */
> > +	if ((pgoff | PG_PMD_COLOUR) >= size)
> > +		return VM_FAULT_FALLBACK;
> > +
> > +	/*
> > +	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
> > +	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
> > +	 * the tree, for instance), it will return -EEXIST and we just fall
> > +	 * back to 4k entries.
> > +	 */
> > +	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
> > +	if (IS_ERR(entry))
> > +		return VM_FAULT_FALLBACK;
> > +
> > +	/*
> > +	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
> > +	 * setting up a mapping, so really we're using iomap_begin() as a way
> > +	 * to look up our filesystem block.
> > +	 */
> > +	pos = (loff_t)pgoff << PAGE_SHIFT;
> > +	error = ops->iomap_begin(inode, pos, PMD_SIZE, write ? IOMAP_WRITE : 0,
> > +			&iomap);
> 
> I'm not quite sure if it is OK to call ->iomap_begin() without ever calling
> ->iomap_end. Specifically the comment before iomap_apply() says:
> 
> "It is assumed that the filesystems will lock whatever resources they
> require in the iomap_begin call, and release them in the iomap_end call."
> 
> so what you do could result in unbalanced allocations / locks / whatever.
> Christoph?

I'll add the iomap_end() calls to both the PTE and PMD iomap fault handlers.

> > +	if (error)
> > +		goto fallback;
> > +	if (iomap.offset + iomap.length < pos + PMD_SIZE)
> > +		goto fallback;
> > +
> > +	vmf.pgoff = pgoff;
> > +	vmf.flags = flags;
> > +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS | __GFP_IO;
> 
> I don't think you want __GFP_FS here - we have already gone through the
> filesystem's pmd_fault() handler which called dax_iomap_pmd_fault() and
> thus we hold various fs locks, freeze protection, ...

I copied this from __get_fault_gfp_mask() in mm/memory.c.  That function is
used by do_page_mkwrite() and __do_fault(), and we eventually get this
vmf->gfp_mask in the PTE fault code.  With the code as it is we get the same
vmf->gfp_mask in both dax_iomap_fault() and dax_iomap_pmd_fault().  It seems
like they should remain consistent - is it wrong to have __GFP_FS in
dax_iomap_fault()?

> > diff --git a/include/linux/dax.h b/include/linux/dax.h
> > index c4a51bb..cacff9e 100644
> > --- a/include/linux/dax.h
> > +++ b/include/linux/dax.h
> > @@ -8,8 +8,33 @@
> >  
> >  struct iomap_ops;
> >  
> > -/* We use lowest available exceptional entry bit for locking */
> > +/*
> > + * We use lowest available bit in exceptional entry for locking, two bits for
> > + * the entry type (PMD & PTE), and two more for flags (HZP and empty).  In
> > + * total five special bits.
> > + */
> > +#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 5)
> >  #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
> > +/* PTE and PMD types */
> > +#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
> > +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
> > +/* huge zero page and empty entry flags */
> > +#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
> > +#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 4))
> 
> I think we can do with just 2 bits for type instead of 4 but for now this
> is OK I guess.

I guess we could combine the PMD/PTE choice into the same bit (0=PTE, 1=PMD),
but we have three cases for the other types (zero page, empty entry just for
locking, real DAX based entry with storage), so we need at least 2 bits for
those.

Christoph also suggested some reworks to the "type" logic - I'll look at
simplifying the way the flags are used for DAX entries.  A purely
hypothetical layout along those lines is sketched below.
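
In this sketch the PTE/PMD choice collapses into a single "size" bit
(clear = PTE, set = PMD) and two flag bits cover the zero page / empty cases,
so only four special bits (including the lock bit) are needed instead of
five.  This is illustrative only, not what the posted patch implements:

#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_HZP		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)

/*
 * Zero page, empty-for-locking and "real" storage entries are then
 * distinguished by the HZP/EMPTY flags alone; no RADIX_DAX_PTE bit needed.
 */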

Thank you for the review!
Ross Zwisler Oct. 3, 2016, 9:16 p.m. UTC | #5
On Fri, Sep 30, 2016 at 02:56:27AM -0700, Christoph Hellwig wrote:
> > -/*
> > - * We use lowest available bit in exceptional entry for locking, other two
> > - * bits to determine entry type. In total 3 special bits.
> > - */
> > -#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
> > -#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
> > -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
> > -#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
> > -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
> > -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
> > -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
> > -		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
> > -		RADIX_TREE_EXCEPTIONAL_ENTRY))
> > -
> 
> Please split the move of these constants into a separate patch.

Will do for v5.

> > -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
> > +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
> > +		unsigned long new_type)
> >  {
> > +	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
> >  	void *entry, **slot;
> >  
> >  restart:
> >  	spin_lock_irq(&mapping->tree_lock);
> >  	entry = get_unlocked_mapping_entry(mapping, index, &slot);
> > +
> > +	if (entry && new_type == RADIX_DAX_PMD) {
> > +		if (!radix_tree_exceptional_entry(entry) ||
> > +				RADIX_DAX_TYPE(entry) == RADIX_DAX_PTE) {
> > +			spin_unlock_irq(&mapping->tree_lock);
> > +			return ERR_PTR(-EEXIST);
> > +		}
> > +	} else if (entry && new_type == RADIX_DAX_PTE) {
> > +		if (radix_tree_exceptional_entry(entry) &&
> > +		    RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD &&
> > +		    (unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
> > +			pmd_downgrade = true;
> > +		}
> > +	}
> 
> 	Would be nice to use switch on the type here:
> 
> 	old_type = RADIX_DAX_TYPE(entry);
> 
> 	if (entry) {
> 		switch (new_type) {
> 		case RADIX_DAX_PMD:
> 			if (!radix_tree_exceptional_entry(entry) ||
> 			    old_type == RADIX_DAX_PTE) {
> 			    	entry = ERR_PTR(-EEXIST);
> 				goto out_unlock;
> 			}
> 			break;
> 		case RADIX_DAX_PTE:
> 			if (radix_tree_exceptional_entry(entry) &&
> 			    old_type == RADIX_DAX_PMD &&
> 			    (unsigned long)entry & 
> 			      (RADIX_DAX_HZP|RADIX_DAX_EMPTY))
> 			      	..

Will do.  This definitely makes things less dense and more readable.

> Btw, why are only RADIX_DAX_PTE and RADIX_DAX_PMD in the type mask,
> and not RADIX_DAX_HZP and RADIX_DAX_EMPTY?  With that we could use the
> above old_type local variable throughout this function and make it a lot
> less of a mess.

So essentially in this revision of the series 'type' just means PMD or PTE
(perhaps this should have been 'size' or something), and the Zero Page / Empty
Entry / Regular DAX choice was something separate.  Jan also commented on
this, and I agree there's room for improvement.  Let me try and rework it a
bit for v5.

> >  static void *dax_insert_mapping_entry(struct address_space *mapping,
> >  				      struct vm_fault *vmf,
> > -				      void *entry, sector_t sector)
> > +				      void *entry, sector_t sector,
> > +				      unsigned long new_type, bool hzp)
> 
> And then we could also drop the hzp argument here..

Yep, that would be good.

> >  #ifdef CONFIG_FS_IOMAP
> > +static inline sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
> > +{
> > +	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
> > +}
> 
> Please split adding this new helper into a separate patch.

Done for v5.

> > +#if defined(CONFIG_FS_DAX_PMD)
> 
> Please use #ifdef here.

Will do.

> > +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
> > +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
> > +
> > +/* entries begin locked */
> > +#define RADIX_DAX_ENTRY(sector, type) ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |\
> > +	type | (unsigned long)sector << RADIX_DAX_SHIFT | RADIX_DAX_ENTRY_LOCK))
> > +#define RADIX_DAX_HZP_ENTRY() ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | \
> > +	RADIX_DAX_PMD | RADIX_DAX_HZP | RADIX_DAX_EMPTY | RADIX_DAX_ENTRY_LOCK))
> > +#define RADIX_DAX_EMPTY_ENTRY(type) ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | \
> > +		type | RADIX_DAX_EMPTY | RADIX_DAX_ENTRY_LOCK))
> > +
> > +#define RADIX_DAX_ORDER(type) (type == RADIX_DAX_PMD ? PMD_SHIFT-PAGE_SHIFT : 0)
> 
> All these macros don't properly brace their arguments.  I think
> you'd make your life a lot easier by making them inline functions.

Oh, great point.  Will do, although depending on how the 'type' thing works
out we may end up with just a dax_radix_entry(sector, type) inline function,
where 'type' can be (RADIX_DAX_PMD | RADIX_DAX_EMPTY), etc.  That would
prevent the need for a bunch of different ways of making a new entry.
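
A sketch of that single constructor (hypothetical; 'flags' would carry any
combination of RADIX_DAX_PMD, RADIX_DAX_HZP and RADIX_DAX_EMPTY, keeping the
locked-on-creation behavior):

static inline void *dax_radix_entry(sector_t sector, unsigned long flags)
{
	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
			((unsigned long)sector << RADIX_DAX_SHIFT) |
			RADIX_DAX_ENTRY_LOCK);
}

so e.g. RADIX_DAX_HZP_ENTRY() would become
dax_radix_entry(0, RADIX_DAX_PMD | RADIX_DAX_HZP | RADIX_DAX_EMPTY).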

> > +#if defined(CONFIG_FS_DAX_PMD)
> 
> #ifdef, please

Will do.

Thank you for the review, Christoph.
Jan Kara Oct. 4, 2016, 5:55 a.m. UTC | #6
On Mon 03-10-16 15:05:57, Ross Zwisler wrote:
> > > @@ -623,22 +672,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
> > >  		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
> > >  		if (error)
> > >  			return ERR_PTR(error);
> > > +	} else if ((unsigned long)entry & RADIX_DAX_HZP && !hzp) {
> > > +		/* replacing huge zero page with PMD block mapping */
> > > +		unmap_mapping_range(mapping,
> > > +			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
> > >  	}
> > >  
> > >  	spin_lock_irq(&mapping->tree_lock);
> > > -	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
> > > -		       RADIX_DAX_ENTRY_LOCK);
> > > +	if (hzp)
> > > +		new_entry = RADIX_DAX_HZP_ENTRY();
> > > +	else
> > > +		new_entry = RADIX_DAX_ENTRY(sector, new_type);
> > > +
> > >  	if (hole_fill) {
> > >  		__delete_from_page_cache(entry, NULL);
> > >  		/* Drop pagecache reference */
> > >  		put_page(entry);
> > > -		error = radix_tree_insert(page_tree, index, new_entry);
> > > +		error = __radix_tree_insert(page_tree, index,
> > > +				RADIX_DAX_ORDER(new_type), new_entry);
> > >  		if (error) {
> > >  			new_entry = ERR_PTR(error);
> > >  			goto unlock;
> > >  		}
> > >  		mapping->nrexceptional++;
> > > -	} else {
> > > +	} else if ((unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
> > >  		void **slot;
> > >  		void *ret;
> > 
> > Hum, I somewhat dislike how PTE and PMD paths differ here. But it's OK for
> > now I guess. Long term we might be better off to do away with zero pages
> > for PTEs as well and use exceptional entry and a single zero page like you
> > do for PMD. Because the special cases these zero pages cause are a
> > headache.
> 
> I've been thinking about this as well, and I do think we'd be better off with
> a single zero page for PTEs, as we have with PMDs.  It'd reduce the special
> casing in the DAX code, and it'd also ensure that we don't waste a bunch of
> time and memory creating read-only zero pages to service reads from holes.
> 
> I'll look into adding this for v5.

Well, this would clash with the dirty bit cleaning series I have. So I'd
prefer to put this on a todo list and address it once existing series are
integrated...

> > > +	if (error)
> > > +		goto fallback;
> > > +	if (iomap.offset + iomap.length < pos + PMD_SIZE)
> > > +		goto fallback;
> > > +
> > > +	vmf.pgoff = pgoff;
> > > +	vmf.flags = flags;
> > > +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS | __GFP_IO;
> > 
> > I don't think you want __GFP_FS here - we have already gone through the
> > filesystem's pmd_fault() handler which called dax_iomap_pmd_fault() and
> > thus we hold various fs locks, freeze protection, ...
> 
> I copied this from __get_fault_gfp_mask() in mm/memory.c.  That function is
> used by do_page_mkwrite() and __do_fault(), and we eventually get this
> vmf->gfp_mask in the PTE fault code.  With the code as it is we get the same
> vmf->gfp_mask in both dax_iomap_fault() and dax_iomap_pmd_fault().  It seems
> like they should remain consistent - is it wrong to have __GFP_FS in
> dax_iomap_fault()?

The gfp_mask that propagates from __do_fault() or do_page_mkwrite() is fine
because at that point it is correct. But once we grab filesystem locks
which are not reclaim safe, we should update vmf->gfp_mask we pass further
down into DAX code to not contain __GFP_FS (that's a bug we apparently have
there). And inside DAX code, we definitely are not generally safe to add
__GFP_FS to mapping_gfp_mask(). Maybe we'd be better off propagating struct
vm_fault into this function, using passed gfp_mask there and make sure
callers update gfp_mask as appropriate.

								Honza
Ross Zwisler Oct. 4, 2016, 3:39 p.m. UTC | #7
On Tue, Oct 04, 2016 at 07:55:57AM +0200, Jan Kara wrote:
> On Mon 03-10-16 15:05:57, Ross Zwisler wrote:
> > > > @@ -623,22 +672,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
> > > >  		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
> > > >  		if (error)
> > > >  			return ERR_PTR(error);
> > > > +	} else if ((unsigned long)entry & RADIX_DAX_HZP && !hzp) {
> > > > +		/* replacing huge zero page with PMD block mapping */
> > > > +		unmap_mapping_range(mapping,
> > > > +			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
> > > >  	}
> > > >  
> > > >  	spin_lock_irq(&mapping->tree_lock);
> > > > -	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
> > > > -		       RADIX_DAX_ENTRY_LOCK);
> > > > +	if (hzp)
> > > > +		new_entry = RADIX_DAX_HZP_ENTRY();
> > > > +	else
> > > > +		new_entry = RADIX_DAX_ENTRY(sector, new_type);
> > > > +
> > > >  	if (hole_fill) {
> > > >  		__delete_from_page_cache(entry, NULL);
> > > >  		/* Drop pagecache reference */
> > > >  		put_page(entry);
> > > > -		error = radix_tree_insert(page_tree, index, new_entry);
> > > > +		error = __radix_tree_insert(page_tree, index,
> > > > +				RADIX_DAX_ORDER(new_type), new_entry);
> > > >  		if (error) {
> > > >  			new_entry = ERR_PTR(error);
> > > >  			goto unlock;
> > > >  		}
> > > >  		mapping->nrexceptional++;
> > > > -	} else {
> > > > +	} else if ((unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
> > > >  		void **slot;
> > > >  		void *ret;
> > > 
> > > Hum, I somewhat dislike how PTE and PMD paths differ here. But it's OK for
> > > now I guess. Long term we might be better off to do away with zero pages
> > > for PTEs as well and use exceptional entry and a single zero page like you
> > > do for PMD. Because the special cases these zero pages cause are a
> > > headache.
> > 
> > I've been thinking about this as well, and I do think we'd be better off with
> > a single zero page for PTEs, as we have with PMDs.  It'd reduce the special
> > casing in the DAX code, and it'd also ensure that we don't waste a bunch of
> > time and memory creating read-only zero pages to service reads from holes.
> > 
> > I'll look into adding this for v5.
> 
> Well, this would clash with the dirty bit cleaning series I have. So I'd
> prefer to put this on a todo list and address it once existing series are
> integrated...

Sure, that works.

> > > > +	if (error)
> > > > +		goto fallback;
> > > > +	if (iomap.offset + iomap.length < pos + PMD_SIZE)
> > > > +		goto fallback;
> > > > +
> > > > +	vmf.pgoff = pgoff;
> > > > +	vmf.flags = flags;
> > > > +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS | __GFP_IO;
> > > 
> > > I don't think you want __GFP_FS here - we have already gone through the
> > > filesystem's pmd_fault() handler which called dax_iomap_pmd_fault() and
> > > thus we hold various fs locks, freeze protection, ...
> > 
> > I copied this from __get_fault_gfp_mask() in mm/memory.c.  That function is
> > used by do_page_mkwrite() and __do_fault(), and we eventually get this
> > vmf->gfp_mask in the PTE fault code.  With the code as it is we get the same
> > vmf->gfp_mask in both dax_iomap_fault() and dax_iomap_pmd_fault().  It seems
> > like they should remain consistent - is it wrong to have __GFP_FS in
> > dax_iomap_fault()?
> 
> The gfp_mask that propagates from __do_fault() or do_page_mkwrite() is fine
> because at that point it is correct. But once we grab filesystem locks
> which are not reclaim safe, we should update vmf->gfp_mask we pass further
> down into DAX code to not contain __GFP_FS (that's a bug we apparently have
> there). And inside DAX code, we definitely are not generally safe to add
> __GFP_FS to mapping_gfp_mask(). Maybe we'd be better off propagating struct
> vm_fault into this function, using passed gfp_mask there and make sure
> callers update gfp_mask as appropriate.

Yep, that makes sense to me.  In reviewing your set it also occurred to me that
we might want to stick a struct vm_area_struct *vma pointer in the vmf, since
you always need a vma when you are using a vmf, but we pass them as a pair
everywhere.
Jan Kara Oct. 5, 2016, 5:50 a.m. UTC | #8
On Tue 04-10-16 09:39:48, Ross Zwisler wrote:
> > The gfp_mask that propagates from __do_fault() or do_page_mkwrite() is fine
> > because at that point it is correct. But once we grab filesystem locks
> > which are not reclaim safe, we should update vmf->gfp_mask we pass further
> > down into DAX code to not contain __GFP_FS (that's a bug we apparently have
> > there). And inside DAX code, we definitely are not generally safe to add
> > __GFP_FS to mapping_gfp_mask(). Maybe we'd be better off propagating struct
> > vm_fault into this function, using passed gfp_mask there and make sure
> > callers update gfp_mask as appropriate.
> 
> Yep, that makes sense to me.  In reviewing your set it also occurred to me that
> we might want to stick a struct vm_area_struct *vma pointer in the vmf, since
> you always need a vma when you are using a vmf, but we pass them as a pair
> everywhere.

Actually, the vma pointer will be in struct vm_fault after my DAX
write-protection series. So once that lands, we can clean up whatever
duplicated function parameters...

								Honza
Ross Zwisler Oct. 6, 2016, 9:34 p.m. UTC | #9
On Mon, Oct 03, 2016 at 03:05:57PM -0600, Ross Zwisler wrote:
> On Mon, Oct 03, 2016 at 12:59:49PM +0200, Jan Kara wrote:
> > On Thu 29-09-16 16:49:28, Ross Zwisler wrote:
<>
> > > +int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> > > +		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
> > > +{
> > > +	struct address_space *mapping = vma->vm_file->f_mapping;
> > > +	unsigned long pmd_addr = address & PMD_MASK;
> > > +	bool write = flags & FAULT_FLAG_WRITE;
> > > +	struct inode *inode = mapping->host;
> > > +	struct iomap iomap = { 0 };
> > > +	int error, result = 0;
> > > +	pgoff_t size, pgoff;
> > > +	struct vm_fault vmf;
> > > +	void *entry;
> > > +	loff_t pos;
> > > +
> > > +	/* Fall back to PTEs if we're going to COW */
> > > +	if (write && !(vma->vm_flags & VM_SHARED)) {
> > > +		split_huge_pmd(vma, pmd, address);
> > > +		return VM_FAULT_FALLBACK;
> > > +	}
> > > +
> > > +	/* If the PMD would extend outside the VMA */
> > > +	if (pmd_addr < vma->vm_start)
> > > +		return VM_FAULT_FALLBACK;
> > > +	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
> > > +		return VM_FAULT_FALLBACK;
> > > +
> > > +	/*
> > > +	 * Check whether offset isn't beyond end of file now. Caller is
> > > +	 * supposed to hold locks serializing us with truncate / punch hole so
> > > +	 * this is a reliable test.
> > > +	 */
> > > +	pgoff = linear_page_index(vma, pmd_addr);
> > > +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> > > +
> > > +	if (pgoff >= size)
> > > +		return VM_FAULT_SIGBUS;
> > > +
> > > +	/* If the PMD would extend beyond the file size */
> > > +	if ((pgoff | PG_PMD_COLOUR) >= size)
> > > +		return VM_FAULT_FALLBACK;
> > > +
> > > +	/*
> > > +	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
> > > +	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
> > > +	 * the tree, for instance), it will return -EEXIST and we just fall
> > > +	 * back to 4k entries.
> > > +	 */
> > > +	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
> > > +	if (IS_ERR(entry))
> > > +		return VM_FAULT_FALLBACK;
> > > +
> > > +	/*
> > > +	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
> > > +	 * setting up a mapping, so really we're using iomap_begin() as a way
> > > +	 * to look up our filesystem block.
> > > +	 */
> > > +	pos = (loff_t)pgoff << PAGE_SHIFT;
> > > +	error = ops->iomap_begin(inode, pos, PMD_SIZE, write ? IOMAP_WRITE : 0,
> > > +			&iomap);
> > 
> > I'm not quite sure if it is OK to call ->iomap_begin() without ever calling
> > ->iomap_end. Specifically the comment before iomap_apply() says:
> > 
> > "It is assumed that the filesystems will lock whatever resources they
> > require in the iomap_begin call, and release them in the iomap_end call."
> > 
> > so what you do could result in unbalanced allocations / locks / whatever.
> > Christoph?
> 
> I'll add the iomap_end() calls to both the PTE and PMD iomap fault handlers.

Interesting - adding iomap_end() calls to the DAX PTE fault handler causes an
AA deadlock because we try and retake ei->dax_sem.  We take dax_sem in
ext2_dax_fault() before calling into the DAX code, then if we end up going
through the error path in ext2_iomap_end(), we call 
  ext2_write_failed()
    ext2_truncate_blocks()
      dax_sem_down_write()

Where we try and take dax_sem again.  This error path is really only valid for
I/O operations, but we happen to call it for page faults because 'written' in
ext2_iomap_end() is just 0.

So...how should we handle this?  A few ideas:

1) Just continue to omit the calls to iomap_end() in the DAX page fault
handlers for now, and add them when there is useful work to be done in one of
the filesystems.

2) Add an IOMAP_FAULT flag to the flags passed into iomap_begin() and
iomap_end() to make it explicit that we are calling as part of a fault handler
and not an I/O operation, and use this to adjust the error handling in
ext2_iomap_end().

3) Just work around the existing error handling in ext2_iomap_end() by either
unsetting IOMAP_WRITE or by setting 'written' to the size of the fault.

For #2 or #3, probably add a comment explaining the deadlock and why we need
to never call ext2_write_failed() while handling a page fault.

Thoughts?
Ross Zwisler Oct. 7, 2016, 2:58 a.m. UTC | #10
On Thu, Oct 06, 2016 at 03:34:24PM -0600, Ross Zwisler wrote:
> Interesting - adding iomap_end() calls to the DAX PTE fault handler causes an
> AA deadlock because we try and retake ei->dax_sem.  We take dax_sem in
> ext2_dax_fault() before calling into the DAX code, then if we end up going
> through the error path in ext2_iomap_end(), we call 
>   ext2_write_failed()
>     ext2_truncate_blocks()
>       dax_sem_down_write()
> 
> Where we try and take dax_sem again.  This error path is really only valid for
> I/O operations, but we happen to call it for page faults because 'written' in
> ext2_iomap_end() is just 0.
> 
> So...how should we handle this?  A few ideas:
> 
> 1) Just continue to omit the calls to iomap_end() in the DAX page fault
> handlers for now, and add them when there is useful work to be done in one of
> the filesystems.
> 
> 2) Add an IOMAP_FAULT flag to the flags passed into iomap_begin() and
> iomap_end() to make it explicit that we are calling as part of a fault handler
> and not an I/O operation, and use this to adjust the error handling in
> ext2_iomap_end().
> 
> 3) Just work around the existing error handling in ext2_iomap_end() by either
> unsetting IOMAP_WRITE or by setting 'written' to the size of the fault.
> 
> For #2 or #3, probably add a comment explaining the deadlock and why we need
> to never call ext2_write_failed() while handling a page fault.
> 
> Thoughts?

Never mind, #3 it is, I think it was just a plain bug to call iomap_end() with
'length' != 'written'.
Jan Kara Oct. 7, 2016, 7:24 a.m. UTC | #11
On Thu 06-10-16 20:58:33, Ross Zwisler wrote:
> On Thu, Oct 06, 2016 at 03:34:24PM -0600, Ross Zwisler wrote:
> > Interesting - adding iomap_end() calls to the DAX PTE fault handler causes an
> > AA deadlock because we try and retake ei->dax_sem.  We take dax_sem in
> > ext2_dax_fault() before calling into the DAX code, then if we end up going
> > through the error path in ext2_iomap_end(), we call 
> >   ext2_write_failed()
> >     ext2_truncate_blocks()
> >       dax_sem_down_write()
> > 
> > Where we try and take dax_sem again.  This error path is really only valid for
> > I/O operations, but we happen to call it for page faults because 'written' in
> > ext2_iomap_end() is just 0.
> > 
> > So...how should we handle this?  A few ideas:
> > 
> > 1) Just continue to omit the calls to iomap_end() in the DAX page fault
> > handlers for now, and add them when there is useful work to be done in one of
> > the filesystems.
> > 
> > 2) Add an IOMAP_FAULT flag to the flags passed into iomap_begin() and
> > iomap_end() to make it explicit that we are calling as part of a fault handler
> > and not an I/O operation, and use this to adjust the error handling in
> > ext2_iomap_end().
> > 
> > 3) Just work around the existing error handling in ext2_iomap_end() by either
> > unsetting IOMAP_WRITE or by setting 'written' to the size of the fault.
> > 
> > For #2 or #3, probably add a comment explaining the deadlock and why we need
> > to never call ext2_write_failed() while handling a page fault.
> > 
> > Thoughts?
> 
> Never mind, #3 it is, I think it was just a plain bug to call iomap_end() with
> 'length' != 'written'.

Yup, that's what I'd think as well.

								Honza

Patch

diff --git a/fs/dax.c b/fs/dax.c
index 6977e5e..93a818d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -34,20 +34,6 @@ 
 #include <linux/iomap.h>
 #include "internal.h"
 
-/*
- * We use lowest available bit in exceptional entry for locking, other two
- * bits to determine entry type. In total 3 special bits.
- */
-#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
-#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
-		RADIX_TREE_EXCEPTIONAL_ENTRY))
-
 /* We choose 4096 entries - same as per-zone page wait tables */
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -400,19 +386,52 @@  static void *get_unlocked_mapping_entry(struct address_space *mapping,
  * radix tree entry locked. If the radix tree doesn't contain given index,
  * create empty exceptional entry for the index and return with it locked.
  *
+ * When requesting an entry with type RADIX_DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return an error.  This error will
+ * happen if there are any 4k entries (either zero pages or DAX entries)
+ * within the 2MiB range that we are requesting.
+ *
+ * We always favor 4k entries over 2MiB entries. There isn't a flow where we
+ * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  Also, a 2MiB
+ * insertion will fail if it finds any 4k entries already in the tree, and a
+ * 4k insertion will cause an existing 2MiB entry to be unmapped and
+ * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
+ * well as 2MiB empty entries.
+ *
+ * The exception to this downgrade path is for 2MiB DAX PMD entries that have
+ * real storage backing them.  We will leave these real 2MiB DAX entries in
+ * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ *
  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  * persistent memory the benefit is doubtful. We can add that later if we can
  * show it helps.
  */
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
+		unsigned long new_type)
 {
+	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
 	void *entry, **slot;
 
 restart:
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+
+	if (entry && new_type == RADIX_DAX_PMD) {
+		if (!radix_tree_exceptional_entry(entry) ||
+				RADIX_DAX_TYPE(entry) == RADIX_DAX_PTE) {
+			spin_unlock_irq(&mapping->tree_lock);
+			return ERR_PTR(-EEXIST);
+		}
+	} else if (entry && new_type == RADIX_DAX_PTE) {
+		if (radix_tree_exceptional_entry(entry) &&
+		    RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD &&
+		    (unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
+			pmd_downgrade = true;
+		}
+	}
+
 	/* No entry for given index? Make sure radix tree is big enough. */
-	if (!entry) {
+	if (!entry || pmd_downgrade) {
 		int err;
 
 		spin_unlock_irq(&mapping->tree_lock);
@@ -420,15 +439,39 @@  restart:
 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
 		if (err)
 			return ERR_PTR(err);
-		entry = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-			       RADIX_DAX_ENTRY_LOCK);
+
+		/*
+		 * Besides huge zero pages the only other thing that gets
+		 * downgraded are empty entries which don't need to be
+		 * unmapped.
+		 */
+		if (pmd_downgrade && ((unsigned long)entry & RADIX_DAX_HZP))
+			unmap_mapping_range(mapping,
+				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+
 		spin_lock_irq(&mapping->tree_lock);
-		err = radix_tree_insert(&mapping->page_tree, index, entry);
+
+		if (pmd_downgrade) {
+			radix_tree_delete(&mapping->page_tree, index);
+			mapping->nrexceptional--;
+			dax_wake_mapping_entry_waiter(entry, mapping, index,
+					false);
+		}
+
+		entry = RADIX_DAX_EMPTY_ENTRY(new_type);
+
+		err = __radix_tree_insert(&mapping->page_tree, index,
+				RADIX_DAX_ORDER(new_type), entry);
 		radix_tree_preload_end();
 		if (err) {
 			spin_unlock_irq(&mapping->tree_lock);
-			/* Someone already created the entry? */
-			if (err == -EEXIST)
+			/*
+			 * Someone already created the entry?  This is a
+			 * normal failure when inserting PMDs in a range
+			 * that already contains PTEs.  In that case we want
+			 * to return -EEXIST immediately.
+			 */
+			if (err == -EEXIST && new_type == RADIX_DAX_PTE)
 				goto restart;
 			return ERR_PTR(err);
 		}
@@ -596,11 +639,17 @@  static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size
 	return 0;
 }
 
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
-
+/*
+ * By this point grab_mapping_entry() has ensured that we have a locked entry
+ * of the appropriate size so we don't have to worry about downgrading PMDs to
+ * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
+ * already in the tree, we will skip the insertion and just dirty the PMD as
+ * appropriate.
+ */
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
-				      void *entry, sector_t sector)
+				      void *entry, sector_t sector,
+				      unsigned long new_type, bool hzp)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	int error = 0;
@@ -623,22 +672,30 @@  static void *dax_insert_mapping_entry(struct address_space *mapping,
 		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
 		if (error)
 			return ERR_PTR(error);
+	} else if ((unsigned long)entry & RADIX_DAX_HZP && !hzp) {
+		/* replacing huge zero page with PMD block mapping */
+		unmap_mapping_range(mapping,
+			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
-	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
-		       RADIX_DAX_ENTRY_LOCK);
+	if (hzp)
+		new_entry = RADIX_DAX_HZP_ENTRY();
+	else
+		new_entry = RADIX_DAX_ENTRY(sector, new_type);
+
 	if (hole_fill) {
 		__delete_from_page_cache(entry, NULL);
 		/* Drop pagecache reference */
 		put_page(entry);
-		error = radix_tree_insert(page_tree, index, new_entry);
+		error = __radix_tree_insert(page_tree, index,
+				RADIX_DAX_ORDER(new_type), new_entry);
 		if (error) {
 			new_entry = ERR_PTR(error);
 			goto unlock;
 		}
 		mapping->nrexceptional++;
-	} else {
+	} else if ((unsigned long)entry & (RADIX_DAX_HZP|RADIX_DAX_EMPTY)) {
 		void **slot;
 		void *ret;
 
@@ -694,6 +751,18 @@  static int dax_writeback_one(struct block_device *bdev,
 		goto unlock;
 	}
 
+	if (WARN_ON_ONCE((unsigned long)entry & RADIX_DAX_EMPTY)) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	/*
+	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
+	 * in the middle of a PMD, the 'index' we are given will be aligned to
+	 * the start index of the PMD, as will the sector we pull from
+	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
+	 * worry about partial PMD writebacks.
+	 */
 	dax.sector = RADIX_DAX_SECTOR(entry);
 	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
 	spin_unlock_irq(&mapping->tree_lock);
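
The comment above relies on a multi-order radix tree lookup resolving to the
entry's PMD-aligned start index even when wbc->range_start lands in the
middle of the 2MiB range.  A tiny userspace sketch of that alignment
arithmetic, using assumed x86_64 shift values rather than anything taken
from this patch:

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed */
#define PMD_SHIFT	21	/* assumed */
#define PG_PMD_COLOUR	((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)	/* 511 */

int main(void)
{
	unsigned long start_index = 0x345;	/* falls inside one PMD */
	unsigned long pmd_index = start_index & ~PG_PMD_COLOUR;

	printf("lookup index %#lx -> entry index %#lx, flush %lu bytes\n",
	       start_index, pmd_index, 1UL << PMD_SHIFT);
	return 0;
}
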
@@ -734,12 +803,11 @@  int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t start_index, end_index;
 	pgoff_t indices[PAGEVEC_SIZE];
 	struct pagevec pvec;
 	bool done = false;
 	int i, ret = 0;
-	void *entry;
 
 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 		return -EIO;
@@ -749,15 +817,6 @@  int dax_writeback_mapping_range(struct address_space *mapping,
 
 	start_index = wbc->range_start >> PAGE_SHIFT;
 	end_index = wbc->range_end >> PAGE_SHIFT;
-	pmd_index = DAX_PMD_INDEX(start_index);
-
-	rcu_read_lock();
-	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
-	rcu_read_unlock();
-
-	/* see if the start of our range is covered by a PMD entry */
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
-		start_index = pmd_index;
 
 	tag_pages_for_writeback(mapping, start_index, end_index);
 
@@ -802,7 +861,8 @@  static int dax_insert_mapping(struct address_space *mapping,
 		return PTR_ERR(dax.addr);
 	dax_unmap_atomic(bdev, &dax);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector,
+			RADIX_DAX_PTE, false);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
 	*entryp = ret;
@@ -849,7 +909,7 @@  int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	entry = grab_mapping_entry(mapping, vmf->pgoff, RADIX_DAX_PTE);
 	if (IS_ERR(entry)) {
 		error = PTR_ERR(entry);
 		goto out;
@@ -1023,6 +1083,11 @@  int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 EXPORT_SYMBOL_GPL(dax_truncate_page);
 
 #ifdef CONFIG_FS_IOMAP
+static inline sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+{
+	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
+}
+
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
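
dax_iomap_sector() above is only byte-to-sector arithmetic, but the units
are easy to get wrong.  A standalone userspace sketch, with a hypothetical
struct and sample numbers that are assumptions for illustration, not values
taken from the patch:

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed */
#define PAGE_MASK	(~((1ULL << PAGE_SHIFT) - 1))

struct fake_iomap {
	unsigned long long blkno;	/* first 512-byte sector of the extent */
	unsigned long long offset;	/* file offset the extent starts at */
};

static unsigned long long fake_dax_iomap_sector(const struct fake_iomap *m,
						unsigned long long pos)
{
	return m->blkno + (((pos & PAGE_MASK) - m->offset) >> 9);
}

int main(void)
{
	struct fake_iomap m = { .blkno = 1000, .offset = 1ULL << 20 };

	/* a fault at file offset 1MiB + 8KiB is 16 sectors into the extent */
	printf("%llu\n", fake_dax_iomap_sector(&m, (1ULL << 20) + 8192)); /* 1016 */
	return 0;
}
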
@@ -1048,8 +1113,7 @@  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct blk_dax_ctl dax = { 0 };
 		ssize_t map_len;
 
-		dax.sector = iomap->blkno +
-			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.sector = dax_iomap_sector(iomap, pos);
 		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
 		map_len = dax_map_atomic(iomap->bdev, &dax);
 		if (map_len < 0) {
@@ -1164,7 +1228,7 @@  int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	entry = grab_mapping_entry(mapping, vmf->pgoff, RADIX_DAX_PTE);
 	if (IS_ERR(entry)) {
 		error = PTR_ERR(entry);
 		goto out;
@@ -1186,7 +1250,7 @@  int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		goto unlock_entry;
 	}
 
-	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+	sector = dax_iomap_sector(&iomap, pos);
 
 	if (vmf->cow_page) {
 		switch (iomap.type) {
@@ -1246,4 +1310,184 @@  int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	return VM_FAULT_NOPAGE | major;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+#if defined(CONFIG_FS_DAX_PMD)
+/*
+ * The 'colour' (i.e. the low bits) of a page offset within a PMD.  This
+ * comes up more often than one might expect in the functions below.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
+		struct vm_fault *vmf, unsigned long address,
+		struct iomap *iomap, loff_t pos, bool write, void **entryp)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct block_device *bdev = iomap->bdev;
+	struct blk_dax_ctl dax = {
+		.sector = dax_iomap_sector(iomap, pos),
+		.size = PMD_SIZE,
+	};
+	long length = dax_map_atomic(bdev, &dax);
+	void *ret;
+
+	if (length < 0) /* dax_map_atomic() failed */
+		return VM_FAULT_FALLBACK;
+	if (length < PMD_SIZE)
+		goto unmap_fallback;
+	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
+		goto unmap_fallback;
+	if (!pfn_t_devmap(dax.pfn))
+		goto unmap_fallback;
+
+	dax_unmap_atomic(bdev, &dax);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp,
+			dax.sector, RADIX_DAX_PMD, false);
+	if (IS_ERR(ret))
+		return VM_FAULT_FALLBACK;
+	*entryp = ret;
+
+	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+
+ unmap_fallback:
+	dax_unmap_atomic(bdev, &dax);
+	return VM_FAULT_FALLBACK;
+}
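
The PG_PMD_COLOUR test in dax_pmd_insert_mapping() above simply requires the
backing pfn to be 2MiB aligned before a PMD mapping is attempted; otherwise
we fall back to PTEs.  A userspace illustration with the usual x86_64 shift
values assumed:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed */
#define PMD_SHIFT	21	/* assumed */
#define PG_PMD_COLOUR	((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)

static bool pfn_ok_for_pmd(unsigned long pfn)
{
	/* the pfn must sit on a 512-page (2MiB) boundary */
	return (pfn & PG_PMD_COLOUR) == 0;
}

int main(void)
{
	unsigned long aligned_pfn = 0x200000UL >> PAGE_SHIFT;	/* 512 */

	printf("%d %d\n", pfn_ok_for_pmd(aligned_pfn),		/* 1 */
	       pfn_ok_for_pmd(aligned_pfn + 1));		/* 0 */
	return 0;
}
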
+
+static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
+		struct vm_fault *vmf, unsigned long address,
+		struct iomap *iomap, void **entryp)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long pmd_addr = address & PMD_MASK;
+	struct page *zero_page;
+	spinlock_t *ptl;
+	pmd_t pmd_entry;
+	void *ret;
+
+	zero_page = get_huge_zero_page();
+
+	if (unlikely(!zero_page))
+		return VM_FAULT_FALLBACK;
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp,
+			0, RADIX_DAX_PMD, true);
+	if (IS_ERR(ret))
+		return VM_FAULT_FALLBACK;
+	*entryp = ret;
+
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	if (!pmd_none(*pmd)) {
+		spin_unlock(ptl);
+		return VM_FAULT_FALLBACK;
+	}
+
+	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+	pmd_entry = pmd_mkhuge(pmd_entry);
+	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+	spin_unlock(ptl);
+	return VM_FAULT_NOPAGE;
+}
+
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long pmd_addr = address & PMD_MASK;
+	bool write = flags & FAULT_FLAG_WRITE;
+	struct inode *inode = mapping->host;
+	struct iomap iomap = { 0 };
+	int error, result = 0;
+	pgoff_t size, pgoff;
+	struct vm_fault vmf;
+	void *entry;
+	loff_t pos;
+
+	/* Fall back to PTEs if we're going to COW */
+	if (write && !(vma->vm_flags & VM_SHARED)) {
+		split_huge_pmd(vma, pmd, address);
+		return VM_FAULT_FALLBACK;
+	}
+
+	/* If the PMD would extend outside the VMA */
+	if (pmd_addr < vma->vm_start)
+		return VM_FAULT_FALLBACK;
+	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+		return VM_FAULT_FALLBACK;
+
+	/*
+	 * Check whether the offset isn't beyond the end of the file now.  The
+	 * caller is supposed to hold locks serializing us with truncate /
+	 * punch hole, so this is a reliable test.
+	 */
+	pgoff = linear_page_index(vma, pmd_addr);
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	if (pgoff >= size)
+		return VM_FAULT_SIGBUS;
+
+	/* If the PMD would extend beyond the file size */
+	if ((pgoff | PG_PMD_COLOUR) >= size)
+		return VM_FAULT_FALLBACK;
+
+	/*
+	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+	 * DAX PMD entry or an HZP entry.  If it can't (because a 4k page is
+	 * already in the tree, for instance), it will return -EEXIST and we
+	 * just fall back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		return VM_FAULT_FALLBACK;
+
+	/*
+	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
+	 * setting up a mapping, so really we're using iomap_begin() as a way
+	 * to look up our filesystem block.
+	 */
+	pos = (loff_t)pgoff << PAGE_SHIFT;
+	error = ops->iomap_begin(inode, pos, PMD_SIZE, write ? IOMAP_WRITE : 0,
+			&iomap);
+	if (error)
+		goto fallback;
+	if (iomap.offset + iomap.length < pos + PMD_SIZE)
+		goto fallback;
+
+	vmf.pgoff = pgoff;
+	vmf.flags = flags;
+	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS | __GFP_IO;
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
+				&iomap, pos, write, &entry);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (WARN_ON_ONCE(write))
+			goto fallback;
+		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
+				&entry);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		result = VM_FAULT_FALLBACK;
+		break;
+	}
+
+	if (result == VM_FAULT_FALLBACK)
+		count_vm_event(THP_FAULT_FALLBACK);
+
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
+	return result;
+
+ fallback:
+	count_vm_event(THP_FAULT_FALLBACK);
+	result = VM_FAULT_FALLBACK;
+	goto unlock_entry;
+}
+EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#endif /* CONFIG_FS_DAX_PMD */
 #endif /* CONFIG_FS_IOMAP */
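
The '(pgoff | PG_PMD_COLOUR) >= size' test in dax_iomap_pmd_fault() is a
little terse: it asks whether the last page index the 2MiB mapping would
cover still lies within the file.  A standalone arithmetic check, again with
assumed x86_64 shift values and made-up sizes:

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed */
#define PMD_SHIFT	21	/* assumed */
#define PG_PMD_COLOUR	((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)

int main(void)
{
	unsigned long size = 0x300;	/* file size, in pages */
	unsigned long pgoff = 0x200;	/* PMD-aligned fault offset, in pages */

	if ((pgoff | PG_PMD_COLOUR) >= size)
		printf("pgoff %#lx: last page %#lx >= size %#lx, fall back\n",
		       pgoff, pgoff | PG_PMD_COLOUR, size);
	return 0;
}
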
diff --git a/include/linux/dax.h b/include/linux/dax.h
index c4a51bb..cacff9e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -8,8 +8,33 @@ 
 
 struct iomap_ops;
 
-/* We use lowest available exceptional entry bit for locking */
+/*
+ * We use the lowest available bit in an exceptional entry for locking, two
+ * bits for the entry type (PMD and PTE), and two more for flags (HZP and
+ * empty), for a total of five special bits.
+ */
+#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 5)
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+/* PTE and PMD types */
+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+/* huge zero page and empty entry flags */
+#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 4))
+
+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+
+/* entries begin locked */
+#define RADIX_DAX_ENTRY(sector, type) ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |\
+	type | (unsigned long)sector << RADIX_DAX_SHIFT | RADIX_DAX_ENTRY_LOCK))
+#define RADIX_DAX_HZP_ENTRY() ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | \
+	RADIX_DAX_PMD | RADIX_DAX_HZP | RADIX_DAX_EMPTY | RADIX_DAX_ENTRY_LOCK))
+#define RADIX_DAX_EMPTY_ENTRY(type) ((void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | \
+		type | RADIX_DAX_EMPTY | RADIX_DAX_ENTRY_LOCK))
+
+#define RADIX_DAX_ORDER(type) (type == RADIX_DAX_PMD ? PMD_SHIFT-PAGE_SHIFT : 0)
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
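
The macros above pack the sector, the entry type and the flag bits into a
single exceptional entry.  The following standalone program round-trips that
encoding; EXC_ENTRY and EXC_SHIFT are assumed stand-ins for
RADIX_TREE_EXCEPTIONAL_ENTRY and RADIX_TREE_EXCEPTIONAL_SHIFT, while the
rest mirrors the definitions just above.

#include <assert.h>
#include <stdio.h>

#define EXC_ENTRY	(1UL << 1)	/* assumed */
#define EXC_SHIFT	2		/* assumed */

#define DAX_SHIFT	(EXC_SHIFT + 5)
#define DAX_ENTRY_LOCK	(1UL << EXC_SHIFT)
#define DAX_PTE		(1UL << (EXC_SHIFT + 1))
#define DAX_PMD		(1UL << (EXC_SHIFT + 2))

#define DAX_ENTRY(sector, type) \
	(EXC_ENTRY | (type) | ((unsigned long)(sector) << DAX_SHIFT) | \
	 DAX_ENTRY_LOCK)
#define DAX_SECTOR(entry)	((entry) >> DAX_SHIFT)
#define DAX_TYPE(entry)		((entry) & (DAX_PTE | DAX_PMD))

int main(void)
{
	unsigned long e = DAX_ENTRY(0x1234, DAX_PMD);

	/* decoding returns the sector and type, and the entry starts locked */
	assert(DAX_SECTOR(e) == 0x1234);
	assert(DAX_TYPE(e) == DAX_PMD);
	printf("entry %#lx: sector %#lx, locked %d\n",
	       e, DAX_SECTOR(e), !!(e & DAX_ENTRY_LOCK));
	return 0;
}
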
@@ -54,6 +79,17 @@  static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	return VM_FAULT_FALLBACK;
 }
 
+#if defined(CONFIG_FS_DAX_PMD)
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
+#else
+static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		struct iomap_ops *ops)
+{
+	return VM_FAULT_FALLBACK;
+}
+#endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 35e880d..d9dd97e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -610,9 +610,7 @@  static int page_cache_tree_insert(struct address_space *mapping,
 				workingset_node_shadows_dec(node);
 		} else {
 			/* DAX can replace empty locked entry with a hole */
-			WARN_ON_ONCE(p !=
-				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-					 RADIX_DAX_ENTRY_LOCK));
+			WARN_ON_ONCE(p != RADIX_DAX_EMPTY_ENTRY(RADIX_DAX_PTE));
 			/* DAX accounts exceptional entries as normal pages */
 			if (node)
 				workingset_node_pages_dec(node);