
[RFC,2/2] dax: use range_lock instead of i_mmap_lock

Message ID 1439219664-88088-3-git-send-email-kirill.shutemov@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Kirill A . Shutemov Aug. 10, 2015, 3:14 p.m. UTC
As we don't have struct pages for DAX memory, Matthew had to find a
replacement for lock_page() to avoid fault vs. truncate races.
i_mmap_lock was used for that.

Recently, Matthew had to convert locking to exclusive to address fault
vs. fault races. And this kills scalability completely.

The patch below tries to recover some scalability for DAX by introducing
per-mapping range lock.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 fs/dax.c           | 30 +++++++++++++++++-------------
 fs/inode.c         |  1 +
 include/linux/fs.h |  2 ++
 mm/memory.c        | 35 +++++++++++++++++++++++------------
 mm/rmap.c          |  1 +
 5 files changed, 44 insertions(+), 25 deletions(-)
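
The caller-side interface assumed from <linux/range_lock.h> (added by the
range-lock infrastructure this series builds on) looks roughly like the
sketch below; the names are taken from the usage in this patch, and the
exact signatures may differ:

	struct range_lock_tree tree;	/* in this patch: mapping->mapping_lock */
	struct range_lock rl;		/* on-stack, covers an inclusive pgoff range */

	range_lock_tree_init(&tree);

	range_lock_init(&rl, first_pgoff, last_pgoff);
	range_lock(&tree, &rl);		/* sleeps while an overlapping range is held */
	/* ... fault or truncate work on [first_pgoff, last_pgoff] ... */
	range_unlock(&tree, &rl);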

Comments

Jan Kara Aug. 11, 2015, 8:19 a.m. UTC | #1
On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote:
> As we don't have struct pages for DAX memory, Matthew had to find an
> replacement for lock_page() to avoid fault vs. truncate races.
> i_mmap_lock was used for that.
> 
> Recently, Matthew had to convert locking to exclusive to address fault
> vs. fault races. And this kills scalability completely.
> 
> The patch below tries to recover some scalability for DAX by introducing
> per-mapping range lock.

So this grows noticeably (3 longs if I'm right) struct address_space and
thus struct inode just for DAX. That looks like a waste but I don't see an
easy solution.

OTOH filesystems in normal mode might want to use the range lock as well to
provide truncate / punch hole vs page fault exclusion (XFS already has a
private rwsem for this and ext4 needs something as well) and at that point
growing generic struct inode would be acceptable for me.

My grand plan was to use the range lock to also simplify locking rules for
read, write and esp. direct IO but that has issues with mmap_sem ordering
because filesystems get called under mmap_sem in page fault path. So
probably just fixing the worst issue with punch hole vs page fault would be
good for now.

Also for a patch set like this, it would be good to show some numbers - how
big a hit do you take in the single-threaded case (the lock is more
expensive) and how much scalability do you get in the multithreaded case?

								Honza

> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  fs/dax.c           | 30 +++++++++++++++++-------------
>  fs/inode.c         |  1 +
>  include/linux/fs.h |  2 ++
>  mm/memory.c        | 35 +++++++++++++++++++++++------------
>  mm/rmap.c          |  1 +
>  5 files changed, 44 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index ed54efedade6..27a68eca698e 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -333,6 +333,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	struct inode *inode = mapping->host;
>  	struct page *page;
>  	struct buffer_head bh;
> +	struct range_lock mapping_lock;
>  	unsigned long vaddr = (unsigned long)vmf->virtual_address;
>  	unsigned blkbits = inode->i_blkbits;
>  	sector_t block;
> @@ -348,6 +349,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
>  	bh.b_size = PAGE_SIZE;
>  
> +	/* do_cow_fault() takes the lock */
> +	if (!vmf->cow_page) {
> +		range_lock_init(&mapping_lock, vmf->pgoff, vmf->pgoff);
> +		range_lock(&mapping->mapping_lock, &mapping_lock);
> +	}
>   repeat:
>  	page = find_get_page(mapping, vmf->pgoff);
>  	if (page) {
> @@ -369,8 +375,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			error = -EIO;
>  			goto unlock;
>  		}
> -	} else {
> -		i_mmap_lock_write(mapping);
>  	}
>  
>  	error = get_block(inode, block, &bh, 0);
> @@ -390,8 +394,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			if (error)
>  				goto unlock;
>  		} else {
> -			i_mmap_unlock_write(mapping);
> -			return dax_load_hole(mapping, page, vmf);
> +			error =  dax_load_hole(mapping, page, vmf);
> +			range_unlock(&mapping->mapping_lock, &mapping_lock);
> +			return error;
>  		}
>  	}
>  
> @@ -446,9 +451,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
>  	}
>  
> -	if (!page)
> -		i_mmap_unlock_write(mapping);
>   out:
> +	if (!vmf->cow_page)
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  	if (error == -ENOMEM)
>  		return VM_FAULT_OOM | major;
>  	/* -EBUSY is fine, somebody else faulted on the same PTE */
> @@ -460,10 +465,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	if (page) {
>  		unlock_page(page);
>  		page_cache_release(page);
> -	} else {
> -		i_mmap_unlock_write(mapping);
>  	}
> -
>  	goto out;
>  }
>  EXPORT_SYMBOL(__dax_fault);
> @@ -510,6 +512,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	struct address_space *mapping = file->f_mapping;
>  	struct inode *inode = mapping->host;
>  	struct buffer_head bh;
> +	struct range_lock mapping_lock;
>  	unsigned blkbits = inode->i_blkbits;
>  	unsigned long pmd_addr = address & PMD_MASK;
>  	bool write = flags & FAULT_FLAG_WRITE;
> @@ -541,7 +544,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
>  
>  	bh.b_size = PMD_SIZE;
> -	i_mmap_lock_write(mapping);
> +	range_lock_init(&mapping_lock, pgoff, pgoff + HPAGE_PMD_NR - 1);
> +	range_lock(&mapping->mapping_lock, &mapping_lock);
>  	length = get_block(inode, block, &bh, write);
>  	if (length)
>  		return VM_FAULT_SIGBUS;
> @@ -568,9 +572,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	 * zero pages covering this hole
>  	 */
>  	if (buffer_new(&bh)) {
> -		i_mmap_unlock_write(mapping);
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
> -		i_mmap_lock_write(mapping);
> +		range_lock(&mapping->mapping_lock, &mapping_lock);
>  	}
>  
>  	/*
> @@ -624,7 +628,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	if (buffer_unwritten(&bh))
>  		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
>  
> -	i_mmap_unlock_write(mapping);
> +	range_unlock(&mapping->mapping_lock, &mapping_lock);
>  
>  	return result;
>  
> diff --git a/fs/inode.c b/fs/inode.c
> index e560535706ff..6a24144d679f 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -343,6 +343,7 @@ void address_space_init_once(struct address_space *mapping)
>  	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
>  	spin_lock_init(&mapping->tree_lock);
>  	init_rwsem(&mapping->i_mmap_rwsem);
> +	range_lock_tree_init(&mapping->mapping_lock);
>  	INIT_LIST_HEAD(&mapping->private_list);
>  	spin_lock_init(&mapping->private_lock);
>  	mapping->i_mmap = RB_ROOT;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index b6361e2e2a26..368e7208d4f2 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -30,6 +30,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/percpu-rwsem.h>
>  #include <linux/blk_types.h>
> +#include <linux/range_lock.h>
>  
>  #include <asm/byteorder.h>
>  #include <uapi/linux/fs.h>
> @@ -429,6 +430,7 @@ struct address_space {
>  	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
>  	struct rb_root		i_mmap;		/* tree of private and shared mappings */
>  	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
> +	struct range_lock_tree	mapping_lock;	/* lock_page() replacement for DAX */
>  	/* Protected by tree_lock together with the radix tree */
>  	unsigned long		nrpages;	/* number of total pages */
>  	unsigned long		nrshadows;	/* number of shadow entries */
> diff --git a/mm/memory.c b/mm/memory.c
> index 7f6a9563d5a6..b4898db7e4c4 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2409,6 +2409,7 @@ void unmap_mapping_range(struct address_space *mapping,
>  		loff_t const holebegin, loff_t const holelen, int even_cows)
>  {
>  	struct zap_details details;
> +	struct range_lock mapping_lock;
>  	pgoff_t hba = holebegin >> PAGE_SHIFT;
>  	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
>  
> @@ -2426,10 +2427,17 @@ void unmap_mapping_range(struct address_space *mapping,
>  	if (details.last_index < details.first_index)
>  		details.last_index = ULONG_MAX;
>  
> +	if (IS_DAX(mapping->host)) {
> +		/* Exclude fault under us */
> +		range_lock_init(&mapping_lock, hba, hba + hlen - 1);
> +		range_lock(&mapping->mapping_lock, &mapping_lock);
> +	}
>  	i_mmap_lock_write(mapping);
>  	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
>  		unmap_mapping_range_tree(&mapping->i_mmap, &details);
>  	i_mmap_unlock_write(mapping);
> +	if (IS_DAX(mapping->host))
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  }
>  EXPORT_SYMBOL(unmap_mapping_range);
>  
> @@ -2978,6 +2986,8 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		unsigned long address, pmd_t *pmd,
>  		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
>  {
> +	struct address_space *mapping = vma->vm_file->f_mapping;
> +	struct range_lock mapping_lock;
>  	struct page *fault_page, *new_page;
>  	struct mem_cgroup *memcg;
>  	spinlock_t *ptl;
> @@ -2996,6 +3006,15 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		return VM_FAULT_OOM;
>  	}
>  
> +	if (IS_DAX(file_inode(vma->vm_file))) {
> +		/*
> +		 * The fault handler has no page to lock, so it holds
> +		 * mapping->mapping_lock to protect against truncate.
> +		 */
> +		range_lock_init(&mapping_lock, pgoff, pgoff);
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
> +	}
> +
>  	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
>  		goto uncharge_out;
> @@ -3010,12 +3029,6 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  		if (fault_page) {
>  			unlock_page(fault_page);
>  			page_cache_release(fault_page);
> -		} else {
> -			/*
> -			 * The fault handler has no page to lock, so it holds
> -			 * i_mmap_lock for write to protect against truncate.
> -			 */
> -			i_mmap_unlock_write(vma->vm_file->f_mapping);
>  		}
>  		goto uncharge_out;
>  	}
> @@ -3026,15 +3039,13 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	if (fault_page) {
>  		unlock_page(fault_page);
>  		page_cache_release(fault_page);
> -	} else {
> -		/*
> -		 * The fault handler has no page to lock, so it holds
> -		 * i_mmap_lock for write to protect against truncate.
> -		 */
> -		i_mmap_unlock_write(vma->vm_file->f_mapping);
>  	}
> +	if (IS_DAX(file_inode(vma->vm_file)))
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  	return ret;
>  uncharge_out:
> +	if (IS_DAX(file_inode(vma->vm_file)))
> +		range_unlock(&mapping->mapping_lock, &mapping_lock);
>  	mem_cgroup_cancel_charge(new_page, memcg);
>  	page_cache_release(new_page);
>  	return ret;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index dcaad464aab0..3d509326d354 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -22,6 +22,7 @@
>   *
>   * inode->i_mutex	(while writing or truncating, not reading or faulting)
>   *   mm->mmap_sem
> + *    mapping->mapping_lock
>   *     page->flags PG_locked (lock_page)
>   *       mapping->i_mmap_rwsem
>   *         anon_vma->rwsem
> -- 
> 2.5.0
>
Dave Chinner Aug. 11, 2015, 9:37 a.m. UTC | #2
On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote:
> On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote:
> > As we don't have struct pages for DAX memory, Matthew had to find an
> > replacement for lock_page() to avoid fault vs. truncate races.
> > i_mmap_lock was used for that.
> > 
> > Recently, Matthew had to convert locking to exclusive to address fault
> > vs. fault races. And this kills scalability completely.

I'm assuming this locking change is in a for-next git tree branch
somewhere as there isn't anything that I can see in a 4.2-rc6
tree. Can you point me to the git tree that has these changes in it?

> > The patch below tries to recover some scalability for DAX by introducing
> > per-mapping range lock.
> 
> So this grows noticeably (3 longs if I'm right) struct address_space and
> thus struct inode just for DAX. That looks like a waste but I don't see an
> easy solution.
> 
> OTOH filesystems in normal mode might want to use the range lock as well to
> provide truncate / punch hole vs page fault exclusion (XFS already has a
> private rwsem for this and ext4 needs something as well) and at that point
> growing generic struct inode would be acceptable for me.

It sounds to me like the way DAX has tried to solve this race is the
wrong direction. We really need to drive the truncate/page fault
serialisation higher up the stack towards the filesystem, not deeper
into the mm subsystem where locking is greatly limited.

As Jan mentions, we already have this serialisation in XFS, and I
think it would be a better first step to replicate that locking model
in each filesystem that supports DAX. I think this is a better
direction because it moves towards solving a whole class of problems
filesystems face with page fault serialisation, not just for DAX.

> My grand plan was to use the range lock to also simplify locking
> rules for read, write and esp. direct IO but that has issues with
> mmap_sem ordering because filesystems get called under mmap_sem in
> page fault path. So probably just fixing the worst issue with
> punch hole vs page fault would be good for now.

Right, I think adding a rwsem to the ext4 inode to handle the
fault/truncate serialisation similar to XFS would be sufficient to
allow DAX to remove the i_mmap_lock serialisation...
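
A minimal sketch of that model, assuming a hypothetical per-inode i_mmap_sem
in ext4 (XFS's equivalent is the XFS_MMAPLOCK); the fault side takes it
shared, truncate / punch hole take it exclusive:

	struct ext4_inode_info *ei = EXT4_I(inode);

	/* fault / page_mkwrite path: shared, faults still run in parallel */
	down_read(&ei->i_mmap_sem);
	ret = __dax_fault(vma, vmf, ext4_get_block, NULL);
	up_read(&ei->i_mmap_sem);

	/* truncate / punch hole path: exclusive, waits for and excludes faults */
	down_write(&ei->i_mmap_sem);
	truncate_setsize(inode, newsize);
	/* unmap_mapping_range(), free blocks, ... */
	up_write(&ei->i_mmap_sem);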

> Also for a patch set like this, it would be good to show some numbers - how
> big hit do you take in the single-threaded case (the lock is more
> expensive) and how much scalability do you get in the multithreaded case?

And also, if you remove the serialisation and run the test on XFS,
what do you get in terms of performance and correctness?

Cheers,

Dave.
Boaz Harrosh Aug. 11, 2015, 11:09 a.m. UTC | #3
On 08/11/2015 12:37 PM, Dave Chinner wrote:
> On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote:
>> On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote:
>>> As we don't have struct pages for DAX memory, Matthew had to find an
>>> replacement for lock_page() to avoid fault vs. truncate races.
>>> i_mmap_lock was used for that.
>>>
>>> Recently, Matthew had to convert locking to exclusive to address fault
>>> vs. fault races. And this kills scalability completely.
> 
> I'm assuming this locking change is in a for-next git tree branch
> somewhere as there isn't anything that I can see in a 4.2-rc6
> tree. Can you point me to the git tree that has these changes in it?
> 
>>> The patch below tries to recover some scalability for DAX by introducing
>>> per-mapping range lock.
>>
>> So this grows noticeably (3 longs if I'm right) struct address_space and
>> thus struct inode just for DAX. That looks like a waste but I don't see an
>> easy solution.
>>
>> OTOH filesystems in normal mode might want to use the range lock as well to
>> provide truncate / punch hole vs page fault exclusion (XFS already has a
>> private rwsem for this and ext4 needs something as well) and at that point
>> growing generic struct inode would be acceptable for me.
> 
> It sounds to me like the way DAX has tried to solve this race is the
> wrong direction. We really need to drive the truncate/page fault
> serialisation higher up the stack towards the filesystem, not deeper
> into the mm subsystem where locking is greatly limited.
> 
> As Jan mentions, we already have this serialisation in XFS, and I
> think it would be better first step to replicate that locking model
> in each filesystem that is supports DAX. I think this is a better
> direction because it moves towards solving a whole class of problems
> fileystem face with page fault serialisation, not just for DAX.
> 
>> My grand plan was to use the range lock to also simplify locking
>> rules for read, write and esp. direct IO but that has issues with
>> mmap_sem ordering because filesystems get called under mmap_sem in
>> page fault path. So probably just fixing the worst issue with
>> punch hole vs page fault would be good for now.
> 
> Right, I think adding a rwsem to the ext4 inode to handle the
> fault/truncate serialisation similar to XFS would be sufficient to
> allow DAX to remove the i_mmap_lock serialisation...
> 
>> Also for a patch set like this, it would be good to show some numbers - how
>> big hit do you take in the single-threaded case (the lock is more
>> expensive) and how much scalability do you get in the multithreaded case?
> 
> And also, if you remove the serialisation and run the test on XFS,
> what do you get in terms of performance and correctness?
> 
> Cheers,
> 

Cheers indeed

It is very easy to serialise at the FS level, which has control over the
truncate path as well. It is much harder and uglier on the mm side.

Currently there is DAX for xfs, ext2 and ext4. With xfs's implementation
I think removing the lock completely will just work, if I read the code
correctly.

With ext2/4 you can at worst add the struct range_lock_tree to their private
inode structures, and again only if dax is configured, and use that in the
fault wrappers. But I suspect there should be an easier way once at the
ext2/4 code level, with no need for generalization.

I think the xfs case, of "no locks needed at all", calls for removing the
locks at the mm/dax level and moving them up to the FS.

> Dave.
> 

Thanks
Boaz

Kirill A . Shutemov Aug. 11, 2015, 12:03 p.m. UTC | #4
On Tue, Aug 11, 2015 at 07:37:08PM +1000, Dave Chinner wrote:
> On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote:
> > On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote:
> > > As we don't have struct pages for DAX memory, Matthew had to find an
> > > replacement for lock_page() to avoid fault vs. truncate races.
> > > i_mmap_lock was used for that.
> > > 
> > > Recently, Matthew had to convert locking to exclusive to address fault
> > > vs. fault races. And this kills scalability completely.
> 
> I'm assuming this locking change is in a for-next git tree branch
> somewhere as there isn't anything that I can see in a 4.2-rc6
> tree. Can you point me to the git tree that has these changes in it?

It's in the -mm tree. See e4261a3ed000 in mhocko/mm.git [1]. There are also
two fixups for that commit [2][3].

[1] git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
[2] http://lkml.kernel.org/g/1438948423-128882-1-git-send-email-kirill.shutemov@linux.intel.com
[3] http://lkml.kernel.org/g/1438948482-129043-1-git-send-email-kirill.shutemov@linux.intel.com

> > > The patch below tries to recover some scalability for DAX by introducing
> > > per-mapping range lock.
> > 
> > So this grows noticeably (3 longs if I'm right) struct address_space and
> > thus struct inode just for DAX. That looks like a waste but I don't see an
> > easy solution.

We can try to convert it to a pointer instead of embedding it into
struct address_space, and make filesystems allocate it when S_DAX is set.
Would that be better?

> > OTOH filesystems in normal mode might want to use the range lock as well to
> > provide truncate / punch hole vs page fault exclusion (XFS already has a
> > private rwsem for this and ext4 needs something as well) and at that point
> > growing generic struct inode would be acceptable for me.
> 
> It sounds to me like the way DAX has tried to solve this race is the
> wrong direction. We really need to drive the truncate/page fault
> serialisation higher up the stack towards the filesystem, not deeper
> into the mm subsystem where locking is greatly limited.

My understanding of fs locking is very limited, but I think we have a
problem with this approach in the fault path: to dispatch a page fault
properly we need to take mmap_sem and find the VMA. Only after that do we
have a chance to obtain any fs lock. And that's the place where we take the
page lock which this range_lock intends to replace.

I don't see how we can move any lock that serializes truncate vs. fault much
higher.

> As Jan mentions, we already have this serialisation in XFS, and I
> think it would be better first step to replicate that locking model
> in each filesystem that is supports DAX. I think this is a better
> direction because it moves towards solving a whole class of problems
> fileystem face with page fault serialisation, not just for DAX.

Could you point me to that lock and the locking rules for it?
 
> > My grand plan was to use the range lock to also simplify locking
> > rules for read, write and esp. direct IO but that has issues with
> > mmap_sem ordering because filesystems get called under mmap_sem in
> > page fault path. So probably just fixing the worst issue with
> > punch hole vs page fault would be good for now.
> 
> Right, I think adding a rwsem to the ext4 inode to handle the
> fault/truncate serialisation similar to XFS would be sufficient to
> allow DAX to remove the i_mmap_lock serialisation...
> 
> > Also for a patch set like this, it would be good to show some numbers - how
> > big hit do you take in the single-threaded case (the lock is more
> > expensive) and how much scalability do you get in the multithreaded case?
> 
> And also, if you remove the serialisation and run the test on XFS,
> what do you get in terms of performance and correctness?

I'll talk with Matthew about numbers.

As we don't have any HW yet, the only numbers we can possibly provide are
for DAX over DRAM.
Jan Kara Aug. 11, 2015, 1:50 p.m. UTC | #5
On Tue 11-08-15 19:37:08, Dave Chinner wrote:
> > > The patch below tries to recover some scalability for DAX by introducing
> > > per-mapping range lock.
> > 
> > So this grows noticeably (3 longs if I'm right) struct address_space and
> > thus struct inode just for DAX. That looks like a waste but I don't see an
> > easy solution.
> > 
> > OTOH filesystems in normal mode might want to use the range lock as well to
> > provide truncate / punch hole vs page fault exclusion (XFS already has a
> > private rwsem for this and ext4 needs something as well) and at that point
> > growing generic struct inode would be acceptable for me.
> 
> It sounds to me like the way DAX has tried to solve this race is the
> wrong direction. We really need to drive the truncate/page fault
> serialisation higher up the stack towards the filesystem, not deeper
> into the mm subsystem where locking is greatly limited.
> 
> As Jan mentions, we already have this serialisation in XFS, and I
> think it would be better first step to replicate that locking model
> in each filesystem that is supports DAX. I think this is a better
> direction because it moves towards solving a whole class of problems
> fileystem face with page fault serialisation, not just for DAX.

Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
fault / page_mkwrite callback so it doesn't provide the exclusion necessary
for DAX which needs exclusive access to the page given range in the page
cache. And replacing i_mmap_lock with an fs-private mmap rwsem is a moot
exercise (at least from the DAX POV).

So regardless whether the lock will be a fs-private one or in
address_space, DAX needs something like the range lock Kirill suggested.
Having the range lock in fs-private part of inode has the advantage that
only filesystems supporting DAX / punch hole will pay the memory overhead.
OTOH most major filesystems need it so the savings would be IMO noticeable
only for tiny systems using special fs etc. So I'm undecided whether
putting the lock in address_space and doing the locking in generic
pagefault / truncate helpers is a better choice or not.
 
								Honza
Boaz Harrosh Aug. 11, 2015, 2:31 p.m. UTC | #6
On 08/11/2015 04:50 PM, Jan Kara wrote:
> On Tue 11-08-15 19:37:08, Dave Chinner wrote:
>>>> The patch below tries to recover some scalability for DAX by introducing
>>>> per-mapping range lock.
>>>
>>> So this grows noticeably (3 longs if I'm right) struct address_space and
>>> thus struct inode just for DAX. That looks like a waste but I don't see an
>>> easy solution.
>>>
>>> OTOH filesystems in normal mode might want to use the range lock as well to
>>> provide truncate / punch hole vs page fault exclusion (XFS already has a
>>> private rwsem for this and ext4 needs something as well) and at that point
>>> growing generic struct inode would be acceptable for me.
>>
>> It sounds to me like the way DAX has tried to solve this race is the
>> wrong direction. We really need to drive the truncate/page fault
>> serialisation higher up the stack towards the filesystem, not deeper
>> into the mm subsystem where locking is greatly limited.
>>
>> As Jan mentions, we already have this serialisation in XFS, and I
>> think it would be better first step to replicate that locking model
>> in each filesystem that is supports DAX. I think this is a better
>> direction because it moves towards solving a whole class of problems
>> fileystem face with page fault serialisation, not just for DAX.
> 
> Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
> fault / page_mkwrite callback so it doesn't provide the exclusion necessary
> for DAX which needs exclusive access to the page given range in the page
> cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot
> excercise (at least from DAX POV).
> 

Hi Jan. So you got me confused above. You say:
	"DAX which needs exclusive access to the page given range in the page cache"

but DAX and page-cache are mutually exclusive. I guess you meant the VMA
range, or the inode->mapping range (which one is it)

Actually I do not understand this race you guys found at all. (Please bear with
me sorry for being slow)

If two threads of the same VMA fault on the same pte
(I'm not sure how you call it I mean a single 4k entry at each VMAs page-table)
then the mm knows how to handle this just fine.

If two processes, ie two VMAs fault on the same inode->mapping. Then an inode
wide lock like XFS's to protect against i_size-change / truncate is more than
enough.
Because with DAX there is no inode->mapping "mapping" at all. You have the call
into the FS with get_block() to replace "holes" (zero pages) with real allocated
blocks, on WRITE faults, but this conversion should be protected inside the FS
already. Then there is the atomic exchange of the PTE which is fine.
(And vice versa with holes mapping and writes)

There is no global "mapping" radix-tree shared between VMAs like we are
used to.

Please explain to me the races you are seeing. I would love to also see them
with xfs. I think there they should not happen.

> So regardless whether the lock will be a fs-private one or in
> address_space, DAX needs something like the range lock Kirill suggested.
> Having the range lock in fs-private part of inode has the advantage that
> only filesystems supporting DAX / punch hole will pay the memory overhead.
> OTOH most major filesystems need it so the savings would be IMO noticeable

punch-hole is truncate for me. With the xfs model of read-write lock where
truncate takes write, any fault taking read before executing the fault looks
good for the FS side of things. I guess you mean the optimization of the
radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.

Please explain. Do you have a reproducer of this race. I would need to understand
this.

Thanks
Boaz

> only for tiny systems using special fs etc. So I'm undecided whether
> putting the lock in address_space and doing the locking in generic
> pagefault / truncate helpers is a better choice or not.
>  
> 								Honza
> 

Kirill A . Shutemov Aug. 11, 2015, 3:28 p.m. UTC | #7
On Tue, Aug 11, 2015 at 05:31:04PM +0300, Boaz Harrosh wrote:
> On 08/11/2015 04:50 PM, Jan Kara wrote:
> > On Tue 11-08-15 19:37:08, Dave Chinner wrote:
> >>>> The patch below tries to recover some scalability for DAX by introducing
> >>>> per-mapping range lock.
> >>>
> >>> So this grows noticeably (3 longs if I'm right) struct address_space and
> >>> thus struct inode just for DAX. That looks like a waste but I don't see an
> >>> easy solution.
> >>>
> >>> OTOH filesystems in normal mode might want to use the range lock as well to
> >>> provide truncate / punch hole vs page fault exclusion (XFS already has a
> >>> private rwsem for this and ext4 needs something as well) and at that point
> >>> growing generic struct inode would be acceptable for me.
> >>
> >> It sounds to me like the way DAX has tried to solve this race is the
> >> wrong direction. We really need to drive the truncate/page fault
> >> serialisation higher up the stack towards the filesystem, not deeper
> >> into the mm subsystem where locking is greatly limited.
> >>
> >> As Jan mentions, we already have this serialisation in XFS, and I
> >> think it would be better first step to replicate that locking model
> >> in each filesystem that is supports DAX. I think this is a better
> >> direction because it moves towards solving a whole class of problems
> >> fileystem face with page fault serialisation, not just for DAX.
> > 
> > Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
> > fault / page_mkwrite callback so it doesn't provide the exclusion necessary
> > for DAX which needs exclusive access to the page given range in the page
> > cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot
> > excercise (at least from DAX POV).
> > 
> 
> Hi Jan. So you got me confused above. You say:
> 	"DAX which needs exclusive access to the page given range in the page cache"
> 
> but DAX and page-cache are mutually exclusive. I guess you meant the VMA
> range, or the inode->mapping range (which one is it)

The second -- pgoff range within the inode->mapping.

> Actually I do not understand this race you guys found at all. (Please bear with
> me sorry for being slow)
> 
> If two threads of the same VMA fault on the same pte
> (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table)
> then the mm knows how to handle this just fine.

It does. But only if we have struct page. See lock_page_or_retry() in
filemap_fault(). Without lock_page() it's problematic.

> If two processes, ie two VMAs fault on the same inode->mapping. Then an inode
> wide lock like XFS's to protect against i_size-change / truncate is more than
> enough.

We also used lock_page() to make sure we shoot out all pages as we don't
exclude page faults during truncate. Consider this race:

	<fault>			<truncate>
	get_block
	check i_size
    				update i_size
				unmap
	setup pte

With a normal page cache we make sure that all pages beyond i_size are
dropped, using lock_page() in truncate_inode_pages_range().

For DAX we need a way to stop all page faults to the pgoff range before
doing unmap.

> Because with DAX there is no inode->mapping "mapping" at all. You have the call
> into the FS with get_block() to replace "holes" (zero pages) with real allocated
> blocks, on WRITE faults, but this conversion should be protected inside the FS
> already. Then there is the atomic exchange of the PTE which is fine.
> (And vis versa with holes mapping and writes)

Having unmap_mapping_range() in PMD fault handling is very unfortunate.
Going to rmap just to solve a page fault is very wrong.
BTW, we need to do it in the write path too.

I'm not convinced that all this "let's avoid backing storage allocation"
in the DAX code is not a layering violation. I think the right place to
solve this is the filesystem. And we have almost all the required handles
for this in place. We only need to change the vm_ops->page_mkwrite()
interface to be able to return a different page than what was given on
input.

> > So regardless whether the lock will be a fs-private one or in
> > address_space, DAX needs something like the range lock Kirill suggested.
> > Having the range lock in fs-private part of inode has the advantage that
> > only filesystems supporting DAX / punch hole will pay the memory overhead.
> > OTOH most major filesystems need it so the savings would be IMO noticeable
> 
> punch-hole is truncate for me. With the xfs model of read-write lock where
> truncate takes write, any fault taking read before executing the fault looks
> good for the FS side of things. I guess you mean the optimization of the
> radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.

Hm. Where does XFS take this read-write lock in the fault path?

IIUC, truncation vs. page fault serialization relies on i_size being
updated before doing truncate_pagecache() and on checking i_size under
page_lock() on the fault side. We don't have an i_size fence for punch hole.

BTW, how can things like ext4_collapse_range() be safe wrt parallel page
faults? Ted?
Boaz Harrosh Aug. 11, 2015, 4:17 p.m. UTC | #8
On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote:
<>
>> Hi Jan. So you got me confused above. You say:
>> 	"DAX which needs exclusive access to the page given range in the page cache"
>>
>> but DAX and page-cache are mutually exclusive. I guess you meant the VMA
>> range, or the inode->mapping range (which one is it)
> 
> The second -- pgoff range within the inode->mapping.
> 

So yes, this is what I do not understand: with DAX the inode->mapping
radix-tree is empty.

>> Actually I do not understand this race you guys found at all. (Please bear with
>> me sorry for being slow)
>>
>> If two threads of the same VMA fault on the same pte
>> (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table)
>> then the mm knows how to handle this just fine.
> 
> It does. But only if we have struct page. See lock_page_or_retry() in
> filemap_fault(). Without lock_page() it's problematic.
> 
>> If two processes, ie two VMAs fault on the same inode->mapping. Then an inode
>> wide lock like XFS's to protect against i_size-change / truncate is more than
>> enough.
> 
> We also used lock_page() to make sure we shoot out all pages as we don't
> exclude page faults during truncate. Consider this race:
> 
> 	<fault>			<truncate>
> 	get_block
> 	check i_size
>     				update i_size
> 				unmap
> 	setup pte
> 

Please consider this scenario then:

 	<fault>			<truncate>
	read_lock(inode)

 	get_block
 	check i_size
	
	read_unlock(inode)

				write_lock(inode)

     				update i_size
				* remove allocated blocks
 				unmap

				write_unlock(inode)

 	setup pte

This is what you are supposed to do in xfs.

> With normal page cache we make sure that all pages beyond i_size is
> dropped using lock_page() in truncate_inode_pages_range().
> 

Yes, there is no truncate_inode_pages_range() in DAX; again, the radix tree
is empty.

Do you have a reproducer? I would like to see this race and also
experiment with xfs (I guess you saw it in ext4).

> For DAX we need a way to stop all page faults to the pgoff range before
> doing unmap.
> 

Why ?

>> Because with DAX there is no inode->mapping "mapping" at all. You have the call
>> into the FS with get_block() to replace "holes" (zero pages) with real allocated
>> blocks, on WRITE faults, but this conversion should be protected inside the FS
>> already. Then there is the atomic exchange of the PTE which is fine.
>> (And vis versa with holes mapping and writes)
> 
> Having unmap_mapping_range() in PMD fault handling is very unfortunate.
> Go to rmap just to solve page fault is very wrong.
> BTW, we need to do it in write path too.
> 

Only the write path, and only when we exchange a zero-page (hole) with
a newly allocated (written-to) page. Both write fault and/or write path.

> I'm not convinced that all these "let's avoid backing storage allocation"
> in DAX code is not layering violation. I think the right place to solve
> this is filesystem. And we have almost all required handles for this in
> place.  We only need to change vm_ops->page_mkwrite() interface to be able
> to return different page than what was given on input.
> 

What? there is no page returned for DAX page_mkwrite(), it is all insert_mixed
with direct pmd.

Ha, I think I see what you are stumbling on. Maybe it is the zero-pages
used when read-mapping holes.

A solution I have (and it has been working for a year now) is to have only a
single zero-page per inode->mapping, inserted at all the holes, and again the
radix-tree is always kept clean. This both saves memory and avoids the race
on the (always empty) radix tree.

Tell me if you want me to send a patch; there is a small trick I do
at vm_ops->page_mkwrite():

	/* our zero page doesn't really hold the correct offset to the file in
	 * page->index so vmf->pgoff is incorrect, lets fix that */
	vmf->pgoff = vma->vm_pgoff + (((unsigned long)vmf->virtual_address -
			vma->vm_start) >> PAGE_SHIFT);

	/* call fault handler to get a real page for writing */
	return __page_fault(vma, vmf);

Again the return from page_mkwrite() && __page_fault(WRITE_CASE) is always
VM_FAULT_NOPAGE, right?

>>> So regardless whether the lock will be a fs-private one or in
>>> address_space, DAX needs something like the range lock Kirill suggested.
>>> Having the range lock in fs-private part of inode has the advantage that
>>> only filesystems supporting DAX / punch hole will pay the memory overhead.
>>> OTOH most major filesystems need it so the savings would be IMO noticeable
>>
>> punch-hole is truncate for me. With the xfs model of read-write lock where
>> truncate takes write, any fault taking read before executing the fault looks
>> good for the FS side of things. I guess you mean the optimization of the
>> radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.
> 
> Hm. Where does XFS take this read-write lock in fault path?
> 
> IIUC, truncation vs. page fault serialization relies on i_size being
> updated before doing truncate_pagecache() and checking i_size under
> page_lock() on fault side. We don't have i_size fence for punch hole.
> 

Again, there is no truncate_pagecache() here.
And yes, the read-write locking will protect punch-hole just like truncate;
see my locking scenario above.

> BTW, how things like ext4_collapse_range() can be safe wrt parallel page
> fault? Ted? 
> 

Thanks
Boaz

Wilcox, Matthew R Aug. 11, 2015, 4:51 p.m. UTC | #9
The race that you're not seeing is page fault vs page fault.  Two threads
each attempt to store a byte to different locations on the same page.  With
a read-mutex to exclude truncates, each thread calls ->get_block.  One of
the threads gets back a buffer marked as BH_New and calls memset() to clear
the page.  The other thread gets back a buffer which isn't marked as BH_New
and simply inserts the mapping, returning to userspace, which stores the
byte ... just in time for the other thread's memset() to write a zero over
the top of it.

Oh, and if it's a load racing against a store, you could read data that used
to be in this block the last time it was used; maybe the contents of
/etc/shadow.

So if we want to be able to handle more than one page fault at a time per
file (... and I think we do ...), we need to have exclusive access to a
range of the file while we're calling ->get_block(), and for a certain
amount of time afterwards.  With non-DAX files, this is the page lock.  My
original proposal for solving this was to enhance the page cache radix tree
to be able to lock something other than a page, but that also has
complexities.
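
In the current __dax_fault() flow, where the fault side itself zeroes a
freshly allocated block after ->get_block() returns it as BH_New, the window
looks roughly like this (a sketch in the style of the diagrams above):

	<fault A (write)>		<fault B (write)>
	->get_block() allocates, BH_New
					->get_block() sees block mapped, !BH_New
					vm_insert_mixed(), return to userspace
					userspace stores a byte
	memset()/clear_page() of the new block
	  (wipes out B's store)
	vm_insert_mixed()
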
Boaz Harrosh Aug. 11, 2015, 6:46 p.m. UTC | #10
On 08/11/2015 07:51 PM, Wilcox, Matthew R wrote:
> The race that you're not seeing is page fault vs page fault. Two
> threads each attempt to store a byte to different locations on the
> same page. With a read-mutex to exclude truncates, each thread calls
> ->get_block. One of the threads gets back a buffer marked as BH_New
> and calls memset() to clear the page. The other thread gets back a
> buffer which isn't marked as BH_New and simply inserts the mapping,
> returning to userspace, which stores the byte ... just in time for
> the other thread's memset() to write a zero over the top of it.
> 

So I guess all you need is to zero it at the FS allocator, like a __GFP_ZERO.
Perhaps you invent a new flag to pass to ->get_block(), say a __GB_FOR_MMAP,
that will make the FS zero the block in the case it was newly allocated.
Surely there is some locking going on inside the FS so as to not allocate two
blocks for the same inode offset. Then do the memset_nt() inside or before
that lock.

I.e. in the presence of __GB_FOR_MMAP the FS will internally convert any
BH_New into a memset_nt() and a regular return.
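
A sketch of how the DAX side of that could look; __GB_FOR_MMAP does not
exist and get_block_t has no flags argument today, so in practice it would
probably have to be a new buffer_head state bit (the names below are made up
for illustration):

	/* hypothetical: ask the FS to zero any block it allocates for this
	 * request under its own allocation lock, before publishing it */
	set_bit(BH_ZeroNew, &bh.b_state);	/* made-up state bit */
	error = get_block(inode, block, &bh, 1);
	/* on return, a BH_New block is already zeroed, so a racing fault
	 * that sees it as mapped cannot expose stale data, and DAX can skip
	 * the late clear_page()/memset() that causes the race above */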

> Oh, and if it's a load racing against a store, you could read data
> that used to be in this block the last time it was used; maybe the
> contents of /etc/shadow.
> 

Do you have any kind of test like this? I wish we could exercise this
and see that we actually solved it.

> So if we want to be able to handle more than one page fault at a time
> per file (... and I think we do ...), we need to have exclusive
> access to a range of the file while we're calling ->get_block(), and
> for a certain amount of time afterwards. With non-DAX files, this is
> the page lock. My original proposal for solving this was to enhance
> the page cache radix tree to be able to lock something other than a
> page, but that also has complexities.
> 

Yes, this could be done with a bit-lock in the RADIX_TREE_EXCEPTIONAL_ENTRY bits
of a radix_tree_exception() entry. These are only used by shmem radix-trees and
will never conflict with DAX.

But it means you will need to populate the radix-tree with radix_tree_exception()
entries for every block accessed, which would be a great pity.

I still think the above extra flag to ->get_block() taps into the filesystem's
already existing infrastructure, and will cost much less pain.

(And it is much more efficient, say in a hot path where the application already
did fallocate for performance.)

Thanks
Boaz

> -----Original Message-----
> From: Boaz Harrosh [mailto:boaz@plexistor.com] 
> Sent: Tuesday, August 11, 2015 7:31 AM
> To: Jan Kara; Dave Chinner
> Cc: Kirill A. Shutemov; Andrew Morton; Wilcox, Matthew R; linux-mm@kvack.org; linux-fsdevel@vger.kernel.org; linux-kernel@vger.kernel.org; Davidlohr Bueso
> Subject: Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
> 
> On 08/11/2015 04:50 PM, Jan Kara wrote:
>> On Tue 11-08-15 19:37:08, Dave Chinner wrote:
>>>>> The patch below tries to recover some scalability for DAX by introducing
>>>>> per-mapping range lock.
>>>>
>>>> So this grows noticeably (3 longs if I'm right) struct address_space and
>>>> thus struct inode just for DAX. That looks like a waste but I don't see an
>>>> easy solution.
>>>>
>>>> OTOH filesystems in normal mode might want to use the range lock as well to
>>>> provide truncate / punch hole vs page fault exclusion (XFS already has a
>>>> private rwsem for this and ext4 needs something as well) and at that point
>>>> growing generic struct inode would be acceptable for me.
>>>
>>> It sounds to me like the way DAX has tried to solve this race is the
>>> wrong direction. We really need to drive the truncate/page fault
>>> serialisation higher up the stack towards the filesystem, not deeper
>>> into the mm subsystem where locking is greatly limited.
>>>
>>> As Jan mentions, we already have this serialisation in XFS, and I
>>> think it would be better first step to replicate that locking model
>>> in each filesystem that is supports DAX. I think this is a better
>>> direction because it moves towards solving a whole class of problems
>>> fileystem face with page fault serialisation, not just for DAX.
>>
>> Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the
>> fault / page_mkwrite callback so it doesn't provide the exclusion necessary
>> for DAX which needs exclusive access to the page given range in the page
>> cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot
>> excercise (at least from DAX POV).
>>
> 
> Hi Jan. So you got me confused above. You say:
> 	"DAX which needs exclusive access to the page given range in the page cache"
> 
> but DAX and page-cache are mutually exclusive. I guess you meant the VMA
> range, or the inode->mapping range (which one is it)
> 
> Actually I do not understand this race you guys found at all. (Please bear with
> me sorry for being slow)
> 
> If two threads of the same VMA fault on the same pte
> (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table)
> then the mm knows how to handle this just fine.
> 
> If two processes, ie two VMAs fault on the same inode->mapping. Then an inode
> wide lock like XFS's to protect against i_size-change / truncate is more than
> enough.
> Because with DAX there is no inode->mapping "mapping" at all. You have the call
> into the FS with get_block() to replace "holes" (zero pages) with real allocated
> blocks, on WRITE faults, but this conversion should be protected inside the FS
> already. Then there is the atomic exchange of the PTE which is fine.
> (And vis versa with holes mapping and writes)
> 
> There is no global "mapping" radix-tree shared between VMAs like we are
> used to.
> 
> Please explain to me the races you are seeing. I would love to also see them
> with xfs. I think there they should not happen.
> 
>> So regardless whether the lock will be a fs-private one or in
>> address_space, DAX needs something like the range lock Kirill suggested.
>> Having the range lock in fs-private part of inode has the advantage that
>> only filesystems supporting DAX / punch hole will pay the memory overhead.
>> OTOH most major filesystems need it so the savings would be IMO noticeable
> 
> punch-hole is truncate for me. With the xfs model of read-write lock where
> truncate takes write, any fault taking read before executing the fault looks
> good for the FS side of things. I guess you mean the optimization of the
> radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty.
> 
> Please explain. Do you have a reproducer of this race. I would need to understand
> this.
> 
> Thanks
> Boaz
> 
>> only for tiny systems using special fs etc. So I'm undecided whether
>> putting the lock in address_space and doing the locking in generic
>> pagefault / truncate helpers is a better choice or not.
>>  
>> 								Honza
>>
> 

Kirill A . Shutemov Aug. 11, 2015, 8:26 p.m. UTC | #11
On Tue, Aug 11, 2015 at 07:17:12PM +0300, Boaz Harrosh wrote:
> On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote:
> > We also used lock_page() to make sure we shoot out all pages as we don't
> > exclude page faults during truncate. Consider this race:
> > 
> > 	<fault>			<truncate>
> > 	get_block
> > 	check i_size
> >     				update i_size
> > 				unmap
> > 	setup pte
> > 
> 
> Please consider this senario then:
> 
>  	<fault>			<truncate>
> 	read_lock(inode)
> 
>  	get_block
>  	check i_size
> 	
> 	read_unlock(inode)
> 
> 				write_lock(inode)
> 
>      				update i_size
> 				* remove allocated blocks
>  				unmap
> 
> 				write_unlock(inode)
> 
>  	setup pte
> 
> IS what you suppose to do in xfs

Do you realize that you have just described a race? :-P

Exactly in this scenario the pfn your pte points to no longer belongs to the
file. Have fun.

> > With normal page cache we make sure that all pages beyond i_size is
> > dropped using lock_page() in truncate_inode_pages_range().
> > 
> 
> Yes there is no truncate_inode_pages_range() in DAX again radix tree is
> empty.
> 
> Please do you have a reproducer I would like to see this race and also
> experiment with xfs (I guess you saw it in ext4)

I don't. And I don't see how a race like the above can be FS-specific. All
the critical points are in generic code.
 
> > For DAX we need a way to stop all page faults to the pgoff range before
> > doing unmap.
> > 
> 
> Why ?

Because you can end up with ptes pointing to pfns which the fs considers
not to be part of the file.

	<truncate>		<fault>
	unmap..
				fault in pfn which unmap already unmapped
	..continue unmap

> >> Because with DAX there is no inode->mapping "mapping" at all. You have the call
> >> into the FS with get_block() to replace "holes" (zero pages) with real allocated
> >> blocks, on WRITE faults, but this conversion should be protected inside the FS
> >> already. Then there is the atomic exchange of the PTE which is fine.
> >> (And vis versa with holes mapping and writes)
> > 
> > Having unmap_mapping_range() in PMD fault handling is very unfortunate.
> > Go to rmap just to solve page fault is very wrong.
> > BTW, we need to do it in write path too.
> > 
> 
> Only the write path and only when we exchange a zero-page (hole) with
> a new allocated (written to) page. Both write fault and/or write-path

No. Always on a new BH. We don't have anything (except rmap) to find out if
any other process has a zero page for the pgoff.
 
> > I'm not convinced that all these "let's avoid backing storage allocation"
> > in DAX code is not layering violation. I think the right place to solve
> > this is filesystem. And we have almost all required handles for this in
> > place.  We only need to change vm_ops->page_mkwrite() interface to be able
> > to return different page than what was given on input.
> > 
> 
> What? there is no page returned for DAX page_mkwrite(), it is all insert_mixed
> with direct pmd.

That was bogus idea, please ignore.

> > Hm. Where does XFS take this read-write lock in the fault path?
> > 
> > IIUC, truncation vs. page fault serialization relies on i_size being
> > updated before doing truncate_pagecache() and checking i_size under the
> > page lock on the fault side. We don't have an i_size fence for punch hole.
> > 
> 
> Again, truncate_pagecache() does nothing here.
> And yes, the read-write locking will protect punch-hole just as it does
> truncate; see my locking scenario above.

Do you mean as racy as your truncate scenario? ;-P
Dave Chinner Aug. 11, 2015, 9:48 p.m. UTC | #12
On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote:
> The race that you're not seeing is page fault vs page fault.  Two
> threads each attempt to store a byte to different locations on the
> same page.  With a read-mutex to exclude truncates, each thread
> calls ->get_block.  One of the threads gets back a buffer marked
> as BH_New and calls memset() to clear the page.  The other thread
> gets back a buffer which isn't marked as BH_New and simply inserts
> the mapping, returning to userspace, which stores the byte ...
> just in time for the other thread's memset() to write a zero over
> the top of it.

So, this is not a truncate race that the XFS MMAPLOCK solves.

However, that doesn't mean that the DAX code needs to add locking to
solve it. The race here is caused by block initialisation being
unserialised after a ->get_block call allocates the block (which the
filesystem serialises via internal locking). Hence two simultaneous
->get_block calls to the same block are guaranteed to have the DAX
block initialisation race with the second ->get_block call that says
the block is already allocated.

IOWs, the way to handle this is to have the ->get_block call handle
the block zeroing for new blocks instead of doing it after the fact
in the generic DAX code where there is no fine-grained serialisation
object available. By calling dax_clear_blocks() in the ->get_block
callback, the filesystem can ensure that the second racing call will
only make progress once the block has been fully initialised by the
first call.

IMO the fix is - again - to move the functionality into the
filesystem where we already have the necessary exclusion in place to
avoid this race condition entirely.
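
Something like the following, as a hedged sketch only (example_fs_get_block(),
example_fs_map_or_alloc_block() and the alloc_lock are made-up stand-ins for a
filesystem's own code and locking; dax_clear_blocks() is the existing helper
in fs/dax.c):

	static int example_fs_get_block(struct inode *inode, sector_t iblock,
					struct buffer_head *bh, int create)
	{
		bool new = false;
		int err;

		/* the fs already serialises block allocation per inode */
		mutex_lock(&EXAMPLE_FS_I(inode)->alloc_lock);
		err = example_fs_map_or_alloc_block(inode, iblock, bh,
						    create, &new);
		if (!err && new)
			/*
			 * Zero the new block before any other racing
			 * ->get_block call can see it as allocated.
			 */
			err = dax_clear_blocks(inode, bh->b_blocknr,
					       bh->b_size);
		mutex_unlock(&EXAMPLE_FS_I(inode)->alloc_lock);

		if (!err && new)
			set_buffer_new(bh);
		return err;
	}

The second racing ->get_block call then returns only after the block has been
initialised, and the generic DAX code no longer needs to zero new blocks after
the fact.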

Cheers,

Dave.
Boaz Harrosh Aug. 12, 2015, 7:54 a.m. UTC | #13
On 08/11/2015 11:26 PM, Kirill A. Shutemov wrote:
> On Tue, Aug 11, 2015 at 07:17:12PM +0300, Boaz Harrosh wrote:
>> On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote:
>>> We also used lock_page() to make sure we shoot out all pages as we don't
>>> exclude page faults during truncate. Consider this race:
>>>
>>> 	<fault>			<truncate>
>>> 	get_block
>>> 	check i_size
>>>     				update i_size
>>> 				unmap
>>> 	setup pte
>>>
>>
>> Please consider this scenario then:
>>
>>  	<fault>			<truncate>
>> 	read_lock(inode)
>>
>>  	get_block
>>  	check i_size
>> 	
>> 	read_unlock(inode)
>>
>> 				write_lock(inode)
>>
>>      				update i_size
>> 				* remove allocated blocks
>>  				unmap
>>
>> 				write_unlock(inode)
>>
>>  	setup pte
>>
>> Is what you are supposed to do in xfs.
> 
> Do you realize that you are describing a race? :-P
> 
> Exactly: in this scenario the pfn your pte points to no longer belongs to the
> file. Have fun.
> 

Sorry, yes, I wrote it wrong. I have now gone back to read the actual code,
and the setup-pte part is also done under the read lock inside the fault
handler, before the r_lock is released.
Of course it is: it is the page fault handler that does the
vm_insert_mixed(vma,,pfn), and in the case of concurrent faults the second
call to vm_insert_mixed will return -EBUSY, which means all is well.
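
That is roughly the pattern already in the generic fault path (see the
"-EBUSY is fine" comment in __dax_fault() in the patch below); sketch only:

	error = vm_insert_mixed(vma, vaddr, pfn);
	/* -EBUSY is fine, somebody else faulted in the same pte */
	if (error && error != -EBUSY)
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;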

So the only thing left is the fault-to-fault zero-the-page race as Matthew
described, and, as Dave and I think, we can make that part of the FS's
get_block, where it is more natural.

Thanks
Boaz

Boaz Harrosh Aug. 12, 2015, 8:51 a.m. UTC | #14
On 08/12/2015 12:48 AM, Dave Chinner wrote:
> On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote:
>> The race that you're not seeing is page fault vs page fault.  Two
>> threads each attempt to store a byte to different locations on the
>> same page.  With a read-mutex to exclude truncates, each thread
>> calls ->get_block.  One of the threads gets back a buffer marked
>> as BH_New and calls memset() to clear the page.  The other thread
>> gets back a buffer which isn't marked as BH_New and simply inserts
>> the mapping, returning to userspace, which stores the byte ...
>> just in time for the other thread's memset() to write a zero over
>> the top of it.
> 
> So, this is not a truncate race that the XFS MMAPLOCK solves.
> 
> However, that doesn't mean that the DAX code needs to add locking to
> solve it. The race here is caused by block initialisation being
> unserialised after a ->get_block call allocates the block (which the
> filesystem serialises via internal locking). Hence two simultaneous
> ->get_block calls to the same block are guaranteed to have the DAX
> block initialisation race with the second ->get_block call that says
> the block is already allocated.
> 
> IOWs, the way to handle this is to have the ->get_block call handle
> the block zeroing for new blocks instead of doing it after the fact
> in the generic DAX code where there is no fine-grained serialisation
> object available. By calling dax_clear_blocks() in the ->get_block
> callback, the filesystem can ensure that the second racing call will
> only make progress once the block has been fully initialised by the
> first call.
> 
> IMO the fix is - again - to move the functionality into the
> filesystem where we already have the necessary exclusion in place to
> avoid this race condition entirely.
> 

Exactly, thanks

> Cheers,
> 
> Dave.
> 

Jan Kara Aug. 13, 2015, 11:30 a.m. UTC | #15
On Wed 12-08-15 07:48:22, Dave Chinner wrote:
> On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote:
> > The race that you're not seeing is page fault vs page fault.  Two
> > threads each attempt to store a byte to different locations on the
> > same page.  With a read-mutex to exclude truncates, each thread
> > calls ->get_block.  One of the threads gets back a buffer marked
> > as BH_New and calls memset() to clear the page.  The other thread
> > gets back a buffer which isn't marked as BH_New and simply inserts
> > the mapping, returning to userspace, which stores the byte ...
> > just in time for the other thread's memset() to write a zero over
> > the top of it.
> 
> So, this is not a truncate race that the XFS MMAPLOCK solves.
> 
> However, that doesn't mean that the DAX code needs to add locking to
> solve it. The race here is caused by block initialisation being
> unserialised after a ->get_block call allocates the block (which the
> filesystem serialises via internal locking). Hence two simultaneous
> > ->get_block calls to the same block are guaranteed to have the DAX
> block initialisation race with the second ->get_block call that says
> the block is already allocated.
> 
> IOWs, the way to handle this is to have the ->get_block call handle
> the block zeroing for new blocks instead of doing it after the fact
> in the generic DAX code where there is no fine-grained serialisation
> object available. By calling dax_clear_blocks() in the ->get_block
> callback, the filesystem can ensure that the second racing call will
> only make progress once the block has been fully initialised by the
> first call.
> 
> IMO the fix is - again - to move the functionality into the
> filesystem where we already have the necessary exclusion in place to
> avoid this race condition entirely.

I'm somewhat sad to add even more functionality to the already overloaded
block mapping interface - we can already allocate delalloc blocks, unwritten
blocks, uninitialized blocks, and now also pre-zeroed blocks. But I agree the
fs already synchronizes block allocation for a given inode, so adding the
pre-zeroing there is pretty easy. Also, getting rid of unwritten extent
handling from DAX code is a nice bonus, so all in all I'm for this approach.

								Honza
diff mbox

Patch

diff --git a/fs/dax.c b/fs/dax.c
index ed54efedade6..27a68eca698e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -333,6 +333,7 @@  int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct inode *inode = mapping->host;
 	struct page *page;
 	struct buffer_head bh;
+	struct range_lock mapping_lock;
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	unsigned blkbits = inode->i_blkbits;
 	sector_t block;
@@ -348,6 +349,11 @@  int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
 	bh.b_size = PAGE_SIZE;
 
+	/* do_cow_fault() takes the lock */
+	if (!vmf->cow_page) {
+		range_lock_init(&mapping_lock, vmf->pgoff, vmf->pgoff);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
  repeat:
 	page = find_get_page(mapping, vmf->pgoff);
 	if (page) {
@@ -369,8 +375,6 @@  int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			error = -EIO;
 			goto unlock;
 		}
-	} else {
-		i_mmap_lock_write(mapping);
 	}
 
 	error = get_block(inode, block, &bh, 0);
@@ -390,8 +394,9 @@  int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			if (error)
 				goto unlock;
 		} else {
-			i_mmap_unlock_write(mapping);
-			return dax_load_hole(mapping, page, vmf);
+			error =  dax_load_hole(mapping, page, vmf);
+			range_unlock(&mapping->mapping_lock, &mapping_lock);
+			return error;
 		}
 	}
 
@@ -446,9 +451,9 @@  int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
 	}
 
-	if (!page)
-		i_mmap_unlock_write(mapping);
  out:
+	if (!vmf->cow_page)
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
@@ -460,10 +465,7 @@  int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
-	} else {
-		i_mmap_unlock_write(mapping);
 	}
-
 	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -510,6 +512,7 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	struct buffer_head bh;
+	struct range_lock mapping_lock;
 	unsigned blkbits = inode->i_blkbits;
 	unsigned long pmd_addr = address & PMD_MASK;
 	bool write = flags & FAULT_FLAG_WRITE;
@@ -541,7 +544,8 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	i_mmap_lock_write(mapping);
+	range_lock_init(&mapping_lock, pgoff, pgoff + HPAGE_PMD_NR - 1);
+	range_lock(&mapping->mapping_lock, &mapping_lock);
 	length = get_block(inode, block, &bh, write);
 	if (length)
 		return VM_FAULT_SIGBUS;
@@ -568,9 +572,9 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 * zero pages covering this hole
 	 */
 	if (buffer_new(&bh)) {
-		i_mmap_unlock_write(mapping);
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_write(mapping);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
 	}
 
 	/*
@@ -624,7 +628,7 @@  int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (buffer_unwritten(&bh))
 		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 
-	i_mmap_unlock_write(mapping);
+	range_unlock(&mapping->mapping_lock, &mapping_lock);
 
 	return result;
 
diff --git a/fs/inode.c b/fs/inode.c
index e560535706ff..6a24144d679f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -343,6 +343,7 @@  void address_space_init_once(struct address_space *mapping)
 	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
 	spin_lock_init(&mapping->tree_lock);
 	init_rwsem(&mapping->i_mmap_rwsem);
+	range_lock_tree_init(&mapping->mapping_lock);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	mapping->i_mmap = RB_ROOT;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b6361e2e2a26..368e7208d4f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -30,6 +30,7 @@ 
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
+#include <linux/range_lock.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -429,6 +430,7 @@  struct address_space {
 	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
+	struct range_lock_tree	mapping_lock;	/* lock_page() replacement for DAX */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
 	unsigned long		nrshadows;	/* number of shadow entries */
diff --git a/mm/memory.c b/mm/memory.c
index 7f6a9563d5a6..b4898db7e4c4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2409,6 +2409,7 @@  void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
 	struct zap_details details;
+	struct range_lock mapping_lock;
 	pgoff_t hba = holebegin >> PAGE_SHIFT;
 	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
@@ -2426,10 +2427,17 @@  void unmap_mapping_range(struct address_space *mapping,
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 
+	if (IS_DAX(mapping->host)) {
+		/* Exclude fault under us */
+		range_lock_init(&mapping_lock, hba, hba + hlen - 1);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
 	i_mmap_lock_write(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	i_mmap_unlock_write(mapping);
+	if (IS_DAX(mapping->host))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -2978,6 +2986,8 @@  static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct range_lock mapping_lock;
 	struct page *fault_page, *new_page;
 	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
@@ -2996,6 +3006,15 @@  static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	}
 
+	if (IS_DAX(file_inode(vma->vm_file))) {
+		/*
+		 * The fault handler has no page to lock, so it holds
+		 * mapping->mapping_lock to protect against truncate.
+		 */
+		range_lock_init(&mapping_lock, pgoff, pgoff);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
+
 	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		goto uncharge_out;
@@ -3010,12 +3029,6 @@  static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (fault_page) {
 			unlock_page(fault_page);
 			page_cache_release(fault_page);
-		} else {
-			/*
-			 * The fault handler has no page to lock, so it holds
-			 * i_mmap_lock for write to protect against truncate.
-			 */
-			i_mmap_unlock_write(vma->vm_file->f_mapping);
 		}
 		goto uncharge_out;
 	}
@@ -3026,15 +3039,13 @@  static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (fault_page) {
 		unlock_page(fault_page);
 		page_cache_release(fault_page);
-	} else {
-		/*
-		 * The fault handler has no page to lock, so it holds
-		 * i_mmap_lock for write to protect against truncate.
-		 */
-		i_mmap_unlock_write(vma->vm_file->f_mapping);
 	}
+	if (IS_DAX(file_inode(vma->vm_file)))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	return ret;
 uncharge_out:
+	if (IS_DAX(file_inode(vma->vm_file)))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	mem_cgroup_cancel_charge(new_page, memcg);
 	page_cache_release(new_page);
 	return ret;
diff --git a/mm/rmap.c b/mm/rmap.c
index dcaad464aab0..3d509326d354 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -22,6 +22,7 @@ 
  *
  * inode->i_mutex	(while writing or truncating, not reading or faulting)
  *   mm->mmap_sem
+ *    mapping->mapping_lock
  *     page->flags PG_locked (lock_page)
  *       mapping->i_mmap_rwsem
  *         anon_vma->rwsem