Message ID | 1439219664-88088-3-git-send-email-kirill.shutemov@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote: > As we don't have struct pages for DAX memory, Matthew had to find an > replacement for lock_page() to avoid fault vs. truncate races. > i_mmap_lock was used for that. > > Recently, Matthew had to convert locking to exclusive to address fault > vs. fault races. And this kills scalability completely. > > The patch below tries to recover some scalability for DAX by introducing > per-mapping range lock. So this grows noticeably (3 longs if I'm right) struct address_space and thus struct inode just for DAX. That looks like a waste but I don't see an easy solution. OTOH filesystems in normal mode might want to use the range lock as well to provide truncate / punch hole vs page fault exclusion (XFS already has a private rwsem for this and ext4 needs something as well) and at that point growing generic struct inode would be acceptable for me. My grand plan was to use the range lock to also simplify locking rules for read, write and esp. direct IO but that has issues with mmap_sem ordering because filesystems get called under mmap_sem in page fault path. So probably just fixing the worst issue with punch hole vs page fault would be good for now. Also for a patch set like this, it would be good to show some numbers - how big hit do you take in the single-threaded case (the lock is more expensive) and how much scalability do you get in the multithreaded case? Honza > Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> > --- > fs/dax.c | 30 +++++++++++++++++------------- > fs/inode.c | 1 + > include/linux/fs.h | 2 ++ > mm/memory.c | 35 +++++++++++++++++++++++------------ > mm/rmap.c | 1 + > 5 files changed, 44 insertions(+), 25 deletions(-) > > diff --git a/fs/dax.c b/fs/dax.c > index ed54efedade6..27a68eca698e 100644 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -333,6 +333,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, > struct inode *inode = mapping->host; > struct page *page; > struct buffer_head bh; > + struct range_lock mapping_lock; > unsigned long vaddr = (unsigned long)vmf->virtual_address; > unsigned blkbits = inode->i_blkbits; > sector_t block; > @@ -348,6 +349,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, > block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); > bh.b_size = PAGE_SIZE; > > + /* do_cow_fault() takes the lock */ > + if (!vmf->cow_page) { > + range_lock_init(&mapping_lock, vmf->pgoff, vmf->pgoff); > + range_lock(&mapping->mapping_lock, &mapping_lock); > + } > repeat: > page = find_get_page(mapping, vmf->pgoff); > if (page) { > @@ -369,8 +375,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, > error = -EIO; > goto unlock; > } > - } else { > - i_mmap_lock_write(mapping); > } > > error = get_block(inode, block, &bh, 0); > @@ -390,8 +394,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, > if (error) > goto unlock; > } else { > - i_mmap_unlock_write(mapping); > - return dax_load_hole(mapping, page, vmf); > + error = dax_load_hole(mapping, page, vmf); > + range_unlock(&mapping->mapping_lock, &mapping_lock); > + return error; > } > } > > @@ -446,9 +451,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, > WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); > } > > - if (!page) > - i_mmap_unlock_write(mapping); > out: > + if (!vmf->cow_page) > + range_unlock(&mapping->mapping_lock, &mapping_lock); > if (error == -ENOMEM) 
> return VM_FAULT_OOM | major; > /* -EBUSY is fine, somebody else faulted on the same PTE */ > @@ -460,10 +465,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, > if (page) { > unlock_page(page); > page_cache_release(page); > - } else { > - i_mmap_unlock_write(mapping); > } > - > goto out; > } > EXPORT_SYMBOL(__dax_fault); > @@ -510,6 +512,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, > struct address_space *mapping = file->f_mapping; > struct inode *inode = mapping->host; > struct buffer_head bh; > + struct range_lock mapping_lock; > unsigned blkbits = inode->i_blkbits; > unsigned long pmd_addr = address & PMD_MASK; > bool write = flags & FAULT_FLAG_WRITE; > @@ -541,7 +544,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, > block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); > > bh.b_size = PMD_SIZE; > - i_mmap_lock_write(mapping); > + range_lock_init(&mapping_lock, pgoff, pgoff + HPAGE_PMD_NR - 1); > + range_lock(&mapping->mapping_lock, &mapping_lock); > length = get_block(inode, block, &bh, write); > if (length) > return VM_FAULT_SIGBUS; > @@ -568,9 +572,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, > * zero pages covering this hole > */ > if (buffer_new(&bh)) { > - i_mmap_unlock_write(mapping); > + range_unlock(&mapping->mapping_lock, &mapping_lock); > unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); > - i_mmap_lock_write(mapping); > + range_lock(&mapping->mapping_lock, &mapping_lock); > } > > /* > @@ -624,7 +628,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, > if (buffer_unwritten(&bh)) > complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); > > - i_mmap_unlock_write(mapping); > + range_unlock(&mapping->mapping_lock, &mapping_lock); > > return result; > > diff --git a/fs/inode.c b/fs/inode.c > index e560535706ff..6a24144d679f 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -343,6 +343,7 @@ void 
address_space_init_once(struct address_space *mapping) > INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); > spin_lock_init(&mapping->tree_lock); > init_rwsem(&mapping->i_mmap_rwsem); > + range_lock_tree_init(&mapping->mapping_lock); > INIT_LIST_HEAD(&mapping->private_list); > spin_lock_init(&mapping->private_lock); > mapping->i_mmap = RB_ROOT; > diff --git a/include/linux/fs.h b/include/linux/fs.h > index b6361e2e2a26..368e7208d4f2 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -30,6 +30,7 @@ > #include <linux/lockdep.h> > #include <linux/percpu-rwsem.h> > #include <linux/blk_types.h> > +#include <linux/range_lock.h> > > #include <asm/byteorder.h> > #include <uapi/linux/fs.h> > @@ -429,6 +430,7 @@ struct address_space { > atomic_t i_mmap_writable;/* count VM_SHARED mappings */ > struct rb_root i_mmap; /* tree of private and shared mappings */ > struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ > + struct range_lock_tree mapping_lock; /* lock_page() replacement for DAX */ > /* Protected by tree_lock together with the radix tree */ > unsigned long nrpages; /* number of total pages */ > unsigned long nrshadows; /* number of shadow entries */ > diff --git a/mm/memory.c b/mm/memory.c > index 7f6a9563d5a6..b4898db7e4c4 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -2409,6 +2409,7 @@ void unmap_mapping_range(struct address_space *mapping, > loff_t const holebegin, loff_t const holelen, int even_cows) > { > struct zap_details details; > + struct range_lock mapping_lock; > pgoff_t hba = holebegin >> PAGE_SHIFT; > pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; > > @@ -2426,10 +2427,17 @@ void unmap_mapping_range(struct address_space *mapping, > if (details.last_index < details.first_index) > details.last_index = ULONG_MAX; > > + if (IS_DAX(mapping->host)) { > + /* Exclude fault under us */ > + range_lock_init(&mapping_lock, hba, hba + hlen - 1); > + range_lock(&mapping->mapping_lock, &mapping_lock); > + } > 
i_mmap_lock_write(mapping); > if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) > unmap_mapping_range_tree(&mapping->i_mmap, &details); > i_mmap_unlock_write(mapping); > + if (IS_DAX(mapping->host)) > + range_unlock(&mapping->mapping_lock, &mapping_lock); > } > EXPORT_SYMBOL(unmap_mapping_range); > > @@ -2978,6 +2986,8 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, > unsigned long address, pmd_t *pmd, > pgoff_t pgoff, unsigned int flags, pte_t orig_pte) > { > + struct address_space *mapping = vma->vm_file->f_mapping; > + struct range_lock mapping_lock; > struct page *fault_page, *new_page; > struct mem_cgroup *memcg; > spinlock_t *ptl; > @@ -2996,6 +3006,15 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, > return VM_FAULT_OOM; > } > > + if (IS_DAX(file_inode(vma->vm_file))) { > + /* > + * The fault handler has no page to lock, so it holds > + * mapping->mapping_lock to protect against truncate. > + */ > + range_lock_init(&mapping_lock, pgoff, pgoff); > + range_unlock(&mapping->mapping_lock, &mapping_lock); > + } > + > ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); > if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) > goto uncharge_out; > @@ -3010,12 +3029,6 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, > if (fault_page) { > unlock_page(fault_page); > page_cache_release(fault_page); > - } else { > - /* > - * The fault handler has no page to lock, so it holds > - * i_mmap_lock for write to protect against truncate. > - */ > - i_mmap_unlock_write(vma->vm_file->f_mapping); > } > goto uncharge_out; > } > @@ -3026,15 +3039,13 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, > if (fault_page) { > unlock_page(fault_page); > page_cache_release(fault_page); > - } else { > - /* > - * The fault handler has no page to lock, so it holds > - * i_mmap_lock for write to protect against truncate. 
> - */ > - i_mmap_unlock_write(vma->vm_file->f_mapping); > } > + if (IS_DAX(file_inode(vma->vm_file))) > + range_unlock(&mapping->mapping_lock, &mapping_lock); > return ret; > uncharge_out: > + if (IS_DAX(file_inode(vma->vm_file))) > + range_unlock(&mapping->mapping_lock, &mapping_lock); > mem_cgroup_cancel_charge(new_page, memcg); > page_cache_release(new_page); > return ret; > diff --git a/mm/rmap.c b/mm/rmap.c > index dcaad464aab0..3d509326d354 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -22,6 +22,7 @@ > * > * inode->i_mutex (while writing or truncating, not reading or faulting) > * mm->mmap_sem > + * mapping->mapping_lock > * page->flags PG_locked (lock_page) > * mapping->i_mmap_rwsem > * anon_vma->rwsem > -- > 2.5.0 >
On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote: > On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote: > > As we don't have struct pages for DAX memory, Matthew had to find an > > replacement for lock_page() to avoid fault vs. truncate races. > > i_mmap_lock was used for that. > > > > Recently, Matthew had to convert locking to exclusive to address fault > > vs. fault races. And this kills scalability completely. I'm assuming this locking change is in a for-next git tree branch somewhere as there isn't anything that I can see in a 4.2-rc6 tree. Can you point me to the git tree that has these changes in it? > > The patch below tries to recover some scalability for DAX by introducing > > per-mapping range lock. > > So this grows noticeably (3 longs if I'm right) struct address_space and > thus struct inode just for DAX. That looks like a waste but I don't see an > easy solution. > > OTOH filesystems in normal mode might want to use the range lock as well to > provide truncate / punch hole vs page fault exclusion (XFS already has a > private rwsem for this and ext4 needs something as well) and at that point > growing generic struct inode would be acceptable for me. It sounds to me like the way DAX has tried to solve this race is the wrong direction. We really need to drive the truncate/page fault serialisation higher up the stack towards the filesystem, not deeper into the mm subsystem where locking is greatly limited. As Jan mentions, we already have this serialisation in XFS, and I think it would be better first step to replicate that locking model in each filesystem that is supports DAX. I think this is a better direction because it moves towards solving a whole class of problems fileystem face with page fault serialisation, not just for DAX. > My grand plan was to use the range lock to also simplify locking > rules for read, write and esp. 
direct IO but that has issues with > mmap_sem ordering because filesystems get called under mmap_sem in > page fault path. So probably just fixing the worst issue with > punch hole vs page fault would be good for now. Right, I think adding a rwsem to the ext4 inode to handle the fault/truncate serialisation similar to XFS would be sufficient to allow DAX to remove the i_mmap_lock serialisation... > Also for a patch set like this, it would be good to show some numbers - how > big hit do you take in the single-threaded case (the lock is more > expensive) and how much scalability do you get in the multithreaded case? And also, if you remove the serialisation and run the test on XFS, what do you get in terms of performance and correctness? Cheers, Dave.
On 08/11/2015 12:37 PM, Dave Chinner wrote: > On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote: >> On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote: >>> As we don't have struct pages for DAX memory, Matthew had to find an >>> replacement for lock_page() to avoid fault vs. truncate races. >>> i_mmap_lock was used for that. >>> >>> Recently, Matthew had to convert locking to exclusive to address fault >>> vs. fault races. And this kills scalability completely. > > I'm assuming this locking change is in a for-next git tree branch > somewhere as there isn't anything that I can see in a 4.2-rc6 > tree. Can you point me to the git tree that has these changes in it? > >>> The patch below tries to recover some scalability for DAX by introducing >>> per-mapping range lock. >> >> So this grows noticeably (3 longs if I'm right) struct address_space and >> thus struct inode just for DAX. That looks like a waste but I don't see an >> easy solution. >> >> OTOH filesystems in normal mode might want to use the range lock as well to >> provide truncate / punch hole vs page fault exclusion (XFS already has a >> private rwsem for this and ext4 needs something as well) and at that point >> growing generic struct inode would be acceptable for me. > > It sounds to me like the way DAX has tried to solve this race is the > wrong direction. We really need to drive the truncate/page fault > serialisation higher up the stack towards the filesystem, not deeper > into the mm subsystem where locking is greatly limited. > > As Jan mentions, we already have this serialisation in XFS, and I > think it would be better first step to replicate that locking model > in each filesystem that is supports DAX. I think this is a better > direction because it moves towards solving a whole class of problems > fileystem face with page fault serialisation, not just for DAX. > >> My grand plan was to use the range lock to also simplify locking >> rules for read, write and esp. 
direct IO but that has issues with >> mmap_sem ordering because filesystems get called under mmap_sem in >> page fault path. So probably just fixing the worst issue with >> punch hole vs page fault would be good for now. > > Right, I think adding a rwsem to the ext4 inode to handle the > fault/truncate serialisation similar to XFS would be sufficient to > allow DAX to remove the i_mmap_lock serialisation... > >> Also for a patch set like this, it would be good to show some numbers - how >> big hit do you take in the single-threaded case (the lock is more >> expensive) and how much scalability do you get in the multithreaded case? > > And also, if you remove the serialisation and run the test on XFS, > what do you get in terms of performance and correctness? > > Cheers, > Cheers indeed It is very easy to serialise at the FS level which has control over the truncate path as well. Is much harder and uglier on the mm side. Currently there is DAX for xfs, and ext2, ext4. With xfs's implementation I think removing the lock completely will just work. If I read the code correctly. With ext2/4 you can at worse add the struct range_lock_tree to its private inode structure and again only if dax is configured. And use that at the fault wrappers. But I suspect there should be an easier way once at the ext2/4 code level, and no need for generalization. I think the xfs case, of "no locks needed at all", calls for removing the locks at the mm/dax level and moving them up to the FS. > Dave. > Thanks Boaz -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Aug 11, 2015 at 07:37:08PM +1000, Dave Chinner wrote: > On Tue, Aug 11, 2015 at 10:19:09AM +0200, Jan Kara wrote: > > On Mon 10-08-15 18:14:24, Kirill A. Shutemov wrote: > > > As we don't have struct pages for DAX memory, Matthew had to find an > > > replacement for lock_page() to avoid fault vs. truncate races. > > > i_mmap_lock was used for that. > > > > > > Recently, Matthew had to convert locking to exclusive to address fault > > > vs. fault races. And this kills scalability completely. > > I'm assuming this locking change is in a for-next git tree branch > somewhere as there isn't anything that I can see in a 4.2-rc6 > tree. Can you point me to the git tree that has these changes in it? It's in -mm tree. See e4261a3ed000 in mhocko/mm.git[1]. There are also two fixups for that commit[2][3] [1] git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git [2] http://lkml.kernel.org/g/1438948423-128882-1-git-send-email-kirill.shutemov@linux.intel.com [3] http://lkml.kernel.org/g/1438948482-129043-1-git-send-email-kirill.shutemov@linux.intel.com > > > The patch below tries to recover some scalability for DAX by introducing > > > per-mapping range lock. > > > > So this grows noticeably (3 longs if I'm right) struct address_space and > > thus struct inode just for DAX. That looks like a waste but I don't see an > > easy solution. We can try to convert it to pointer instead of embedding it into struct address_space and make filesystems allocate it on S_DAX setting. Is it better? > > OTOH filesystems in normal mode might want to use the range lock as well to > > provide truncate / punch hole vs page fault exclusion (XFS already has a > > private rwsem for this and ext4 needs something as well) and at that point > > growing generic struct inode would be acceptable for me. > > It sounds to me like the way DAX has tried to solve this race is the > wrong direction. 
We really need to drive the truncate/page fault > serialisation higher up the stack towards the filesystem, not deeper > into the mm subsystem where locking is greatly limited. My understanding of fs locking is very limited, but I think we have problem with this approach in fault path: to dispatch page fault properly we need to take mmap_sem and find VMA. Only after that we have chance to obtain any fs lock. And that's the place where we take page lock which this range_lock intend to replace. I don't see how we can move any lock to serialize truncate vs. fault much higher. > As Jan mentions, we already have this serialisation in XFS, and I > think it would be better first step to replicate that locking model > in each filesystem that is supports DAX. I think this is a better > direction because it moves towards solving a whole class of problems > fileystem face with page fault serialisation, not just for DAX. Could you point me to that lock and locking rules for it? > > My grand plan was to use the range lock to also simplify locking > > rules for read, write and esp. direct IO but that has issues with > > mmap_sem ordering because filesystems get called under mmap_sem in > > page fault path. So probably just fixing the worst issue with > > punch hole vs page fault would be good for now. > > Right, I think adding a rwsem to the ext4 inode to handle the > fault/truncate serialisation similar to XFS would be sufficient to > allow DAX to remove the i_mmap_lock serialisation... > > > Also for a patch set like this, it would be good to show some numbers - how > > big hit do you take in the single-threaded case (the lock is more > > expensive) and how much scalability do you get in the multithreaded case? > > And also, if you remove the serialisation and run the test on XFS, > what do you get in terms of performance and correctness? I'll talk with Matthew on numbers. As we don't have any HW yet, the only numbers we can possibly provide is DAX over DRAM.
On Tue 11-08-15 19:37:08, Dave Chinner wrote: > > > The patch below tries to recover some scalability for DAX by introducing > > > per-mapping range lock. > > > > So this grows noticeably (3 longs if I'm right) struct address_space and > > thus struct inode just for DAX. That looks like a waste but I don't see an > > easy solution. > > > > OTOH filesystems in normal mode might want to use the range lock as well to > > provide truncate / punch hole vs page fault exclusion (XFS already has a > > private rwsem for this and ext4 needs something as well) and at that point > > growing generic struct inode would be acceptable for me. > > It sounds to me like the way DAX has tried to solve this race is the > wrong direction. We really need to drive the truncate/page fault > serialisation higher up the stack towards the filesystem, not deeper > into the mm subsystem where locking is greatly limited. > > As Jan mentions, we already have this serialisation in XFS, and I > think it would be better first step to replicate that locking model > in each filesystem that is supports DAX. I think this is a better > direction because it moves towards solving a whole class of problems > fileystem face with page fault serialisation, not just for DAX. Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the fault / page_mkwrite callback so it doesn't provide the exclusion necessary for DAX which needs exclusive access to the page given range in the page cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot excercise (at least from DAX POV). So regardless whether the lock will be a fs-private one or in address_space, DAX needs something like the range lock Kirill suggested. Having the range lock in fs-private part of inode has the advantage that only filesystems supporting DAX / punch hole will pay the memory overhead. OTOH most major filesystems need it so the savings would be IMO noticeable only for tiny systems using special fs etc. 
So I'm undecided whether putting the lock in address_space and doing the locking in generic page fault / truncate helpers is a better choice or not. Honza
On 08/11/2015 04:50 PM, Jan Kara wrote: > On Tue 11-08-15 19:37:08, Dave Chinner wrote: >>>> The patch below tries to recover some scalability for DAX by introducing >>>> per-mapping range lock. >>> >>> So this grows noticeably (3 longs if I'm right) struct address_space and >>> thus struct inode just for DAX. That looks like a waste but I don't see an >>> easy solution. >>> >>> OTOH filesystems in normal mode might want to use the range lock as well to >>> provide truncate / punch hole vs page fault exclusion (XFS already has a >>> private rwsem for this and ext4 needs something as well) and at that point >>> growing generic struct inode would be acceptable for me. >> >> It sounds to me like the way DAX has tried to solve this race is the >> wrong direction. We really need to drive the truncate/page fault >> serialisation higher up the stack towards the filesystem, not deeper >> into the mm subsystem where locking is greatly limited. >> >> As Jan mentions, we already have this serialisation in XFS, and I >> think it would be better first step to replicate that locking model >> in each filesystem that is supports DAX. I think this is a better >> direction because it moves towards solving a whole class of problems >> fileystem face with page fault serialisation, not just for DAX. > > Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the > fault / page_mkwrite callback so it doesn't provide the exclusion necessary > for DAX which needs exclusive access to the page given range in the page > cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot > excercise (at least from DAX POV). > Hi Jan. So you got me confused above. You say: "DAX which needs exclusive access to the page given range in the page cache" but DAX and page-cache are mutually exclusive. I guess you meant the VMA range, or the inode->mapping range (which one is it) Actually I do not understand this race you guys found at all. 
(Please bear with me sorry for being slow) If two threads of the same VMA fault on the same pte (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table) then the mm knows how to handle this just fine. If two processes, ie two VMAs fault on the same inode->mapping. Then an inode wide lock like XFS's to protect against i_size-change / truncate is more than enough. Because with DAX there is no inode->mapping "mapping" at all. You have the call into the FS with get_block() to replace "holes" (zero pages) with real allocated blocks, on WRITE faults, but this conversion should be protected inside the FS already. Then there is the atomic exchange of the PTE which is fine. (And vis versa with holes mapping and writes) There is no global "mapping" radix-tree shared between VMAs like we are used to. Please explain to me the races you are seeing. I would love to also see them with xfs. I think there they should not happen. > So regardless whether the lock will be a fs-private one or in > address_space, DAX needs something like the range lock Kirill suggested. > Having the range lock in fs-private part of inode has the advantage that > only filesystems supporting DAX / punch hole will pay the memory overhead. > OTOH most major filesystems need it so the savings would be IMO noticeable punch-hole is truncate for me. With the xfs model of read-write lock where truncate takes write, any fault taking read before executing the fault looks good for the FS side of things. I guess you mean the optimization of the radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty. Please explain. Do you have a reproducer of this race. I would need to understand this. Thanks Boaz > only for tiny systems using special fs etc. So I'm undecided whether > putting the lock in address_space and doing the locking in generic > pagefault / truncate helpers is a better choice or not. 
> > Honza > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Aug 11, 2015 at 05:31:04PM +0300, Boaz Harrosh wrote: > On 08/11/2015 04:50 PM, Jan Kara wrote: > > On Tue 11-08-15 19:37:08, Dave Chinner wrote: > >>>> The patch below tries to recover some scalability for DAX by introducing > >>>> per-mapping range lock. > >>> > >>> So this grows noticeably (3 longs if I'm right) struct address_space and > >>> thus struct inode just for DAX. That looks like a waste but I don't see an > >>> easy solution. > >>> > >>> OTOH filesystems in normal mode might want to use the range lock as well to > >>> provide truncate / punch hole vs page fault exclusion (XFS already has a > >>> private rwsem for this and ext4 needs something as well) and at that point > >>> growing generic struct inode would be acceptable for me. > >> > >> It sounds to me like the way DAX has tried to solve this race is the > >> wrong direction. We really need to drive the truncate/page fault > >> serialisation higher up the stack towards the filesystem, not deeper > >> into the mm subsystem where locking is greatly limited. > >> > >> As Jan mentions, we already have this serialisation in XFS, and I > >> think it would be better first step to replicate that locking model > >> in each filesystem that is supports DAX. I think this is a better > >> direction because it moves towards solving a whole class of problems > >> fileystem face with page fault serialisation, not just for DAX. > > > > Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the > > fault / page_mkwrite callback so it doesn't provide the exclusion necessary > > for DAX which needs exclusive access to the page given range in the page > > cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot > > excercise (at least from DAX POV). > > > > Hi Jan. So you got me confused above. You say: > "DAX which needs exclusive access to the page given range in the page cache" > > but DAX and page-cache are mutually exclusive. 
I guess you meant the VMA > range, or the inode->mapping range (which one is it) The second -- pgoff range within the inode->mapping. > Actually I do not understand this race you guys found at all. (Please bear with > me sorry for being slow) > > If two threads of the same VMA fault on the same pte > (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table) > then the mm knows how to handle this just fine. It does. But only if we have struct page. See lock_page_or_retry() in filemap_fault(). Without lock_page() it's problematic. > If two processes, ie two VMAs fault on the same inode->mapping. Then an inode > wide lock like XFS's to protect against i_size-change / truncate is more than > enough. We also used lock_page() to make sure we shoot out all pages as we don't exclude page faults during truncate. Consider this race: <fault> <truncate> get_block check i_size update i_size unmap setup pte With normal page cache we make sure that all pages beyond i_size is dropped using lock_page() in truncate_inode_pages_range(). For DAX we need a way to stop all page faults to the pgoff range before doing unmap. > Because with DAX there is no inode->mapping "mapping" at all. You have the call > into the FS with get_block() to replace "holes" (zero pages) with real allocated > blocks, on WRITE faults, but this conversion should be protected inside the FS > already. Then there is the atomic exchange of the PTE which is fine. > (And vis versa with holes mapping and writes) Having unmap_mapping_range() in PMD fault handling is very unfortunate. Go to rmap just to solve page fault is very wrong. BTW, we need to do it in write path too. I'm not convinced that all these "let's avoid backing storage allocation" in DAX code is not layering violation. I think the right place to solve this is filesystem. And we have almost all required handles for this in place. 
We only need to change vm_ops->page_mkwrite() interface to be able to return different page than what was given on input. > > So regardless whether the lock will be a fs-private one or in > > address_space, DAX needs something like the range lock Kirill suggested. > > Having the range lock in fs-private part of inode has the advantage that > > only filesystems supporting DAX / punch hole will pay the memory overhead. > > OTOH most major filesystems need it so the savings would be IMO noticeable > > punch-hole is truncate for me. With the xfs model of read-write lock where > truncate takes write, any fault taking read before executing the fault looks > good for the FS side of things. I guess you mean the optimization of the > radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty. Hm. Where does XFS take this read-write lock in fault path? IIUC, truncation vs. page fault serialization relies on i_size being updated before doing truncate_pagecache() and checking i_size under page_lock() on fault side. We don't have i_size fence for punch hole. BTW, how things like ext4_collapse_range() can be safe wrt parallel page fault? Ted?
On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote: <> >> Hi Jan. So you got me confused above. You say: >> "DAX which needs exclusive access to the page given range in the page cache" >> >> but DAX and page-cache are mutually exclusive. I guess you meant the VMA >> range, or the inode->mapping range (which one is it) > > The second -- pgoff range within the inode->mapping. > So yes this is what I do not understand with DAX the inode->mapping radix-tree is empty. >> Actually I do not understand this race you guys found at all. (Please bear with >> me sorry for being slow) >> >> If two threads of the same VMA fault on the same pte >> (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table) >> then the mm knows how to handle this just fine. > > It does. But only if we have struct page. See lock_page_or_retry() in > filemap_fault(). Without lock_page() it's problematic. > >> If two processes, ie two VMAs fault on the same inode->mapping. Then an inode >> wide lock like XFS's to protect against i_size-change / truncate is more than >> enough. > > We also used lock_page() to make sure we shoot out all pages as we don't > exclude page faults during truncate. Consider this race: > > <fault> <truncate> > get_block > check i_size > update i_size > unmap > setup pte > Please consider this senario then: <fault> <truncate> read_lock(inode) get_block check i_size read_unlock(inode) write_lock(inode) update i_size * remove allocated blocks unmap write_unlock(inode) setup pte IS what you suppose to do in xfs > With normal page cache we make sure that all pages beyond i_size is > dropped using lock_page() in truncate_inode_pages_range(). > Yes there is no truncate_inode_pages_range() in DAX again radix tree is empty. Please do you have a reproducer I would like to see this race and also experiment with xfs (I guess you saw it in ext4) > For DAX we need a way to stop all page faults to the pgoff range before > doing unmap. > Why ? 
>> Because with DAX there is no inode->mapping "mapping" at all. You have the call >> into the FS with get_block() to replace "holes" (zero pages) with real allocated >> blocks, on WRITE faults, but this conversion should be protected inside the FS >> already. Then there is the atomic exchange of the PTE which is fine. >> (And vis versa with holes mapping and writes) > > Having unmap_mapping_range() in PMD fault handling is very unfortunate. > Go to rmap just to solve page fault is very wrong. > BTW, we need to do it in write path too. > Only the write path and only when we exchange a zero-page (hole) with a new allocated (written to) page. Both write fault and/or write-path > I'm not convinced that all these "let's avoid backing storage allocation" > in DAX code is not layering violation. I think the right place to solve > this is filesystem. And we have almost all required handles for this in > place. We only need to change vm_ops->page_mkwrite() interface to be able > to return different page than what was given on input. > What? there is no page returned for DAX page_mkwrite(), it is all insert_mixed with direct pmd. Ha I think I see what you are tumbling on. Maybe it is the zero-pages when read-mapping holes. A solution I have, (And is working for a year now) is have only a single zero-page per inode->mapping, inserted at all the holes. and again radix-tree is kept clean always. This both saves memory and avoids the race on the (always empty) radix tree. Tell me if you want that I send a patch there is a small trick I do at vm_ops->page_mkwrite(): /* our zero page doesn't really hold the correct offset to the file in * page->index so vmf->pgoff is incorrect, lets fix that */ vmf->pgoff = vma->vm_pgoff + (((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT); /* call fault handler to get a real page for writing */ return __page_fault(vma, vmf); Again the return from page_mkwrite() && __page_fault(WRITE_CASE) is always VM_FAULT_NOPAGE, right? 
>>> So regardless whether the lock will be a fs-private one or in >>> address_space, DAX needs something like the range lock Kirill suggested. >>> Having the range lock in fs-private part of inode has the advantage that >>> only filesystems supporting DAX / punch hole will pay the memory overhead. >>> OTOH most major filesystems need it so the savings would be IMO noticeable >> >> punch-hole is truncate for me. With the xfs model of read-write lock where >> truncate takes write, any fault taking read before executing the fault looks >> good for the FS side of things. I guess you mean the optimization of the >> radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty. > > Hm. Where does XFS take this read-write lock in fault path? > > IIUC, truncation vs. page fault serialization relies on i_size being > updated before doing truncate_pagecache() and checking i_size under > page_lock() on fault side. We don't have i_size fence for punch hole. > again truncate_pagecache() is NONE. And yes the read-write locking will protect punch-hole just as truncate see my locking senario above. > BTW, how things like ext4_collapse_range() can be safe wrt parallel page > fault? Ted? > Thanks Boaz -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
VGhlIHJhY2UgdGhhdCB5b3UncmUgbm90IHNlZWluZyBpcyBwYWdlIGZhdWx0IHZzIHBhZ2UgZmF1 bHQuICBUd28gdGhyZWFkcyBlYWNoIGF0dGVtcHQgdG8gc3RvcmUgYSBieXRlIHRvIGRpZmZlcmVu dCBsb2NhdGlvbnMgb24gdGhlIHNhbWUgcGFnZS4gIFdpdGggYSByZWFkLW11dGV4IHRvIGV4Y2x1 ZGUgdHJ1bmNhdGVzLCBlYWNoIHRocmVhZCBjYWxscyAtPmdldF9ibG9jay4gIE9uZSBvZiB0aGUg dGhyZWFkcyBnZXRzIGJhY2sgYSBidWZmZXIgbWFya2VkIGFzIEJIX05ldyBhbmQgY2FsbHMgbWVt c2V0KCkgdG8gY2xlYXIgdGhlIHBhZ2UuICBUaGUgb3RoZXIgdGhyZWFkIGdldHMgYmFjayBhIGJ1 ZmZlciB3aGljaCBpc24ndCBtYXJrZWQgYXMgQkhfTmV3IGFuZCBzaW1wbHkgaW5zZXJ0cyB0aGUg bWFwcGluZywgcmV0dXJuaW5nIHRvIHVzZXJzcGFjZSwgd2hpY2ggc3RvcmVzIHRoZSBieXRlIC4u LiBqdXN0IGluIHRpbWUgZm9yIHRoZSBvdGhlciB0aHJlYWQncyBtZW1zZXQoKSB0byB3cml0ZSBh IHplcm8gb3ZlciB0aGUgdG9wIG9mIGl0Lg0KDQpPaCwgYW5kIGlmIGl0J3MgYSBsb2FkIHJhY2lu ZyBhZ2FpbnN0IGEgc3RvcmUsIHlvdSBjb3VsZCByZWFkIGRhdGEgdGhhdCB1c2VkIHRvIGJlIGlu IHRoaXMgYmxvY2sgdGhlIGxhc3QgdGltZSBpdCB3YXMgdXNlZDsgbWF5YmUgdGhlIGNvbnRlbnRz IG9mIC9ldGMvc2hhZG93Lg0KDQpTbyBpZiB3ZSB3YW50IHRvIGJlIGFibGUgdG8gaGFuZGxlIG1v cmUgdGhhbiBvbmUgcGFnZSBmYXVsdCBhdCBhIHRpbWUgcGVyIGZpbGUgKC4uLiBhbmQgSSB0aGlu ayB3ZSBkbyAuLi4pLCB3ZSBuZWVkIHRvIGhhdmUgZXhjbHVzaXZlIGFjY2VzcyB0byBhIHJhbmdl IG9mIHRoZSBmaWxlIHdoaWxlIHdlJ3JlIGNhbGxpbmcgLT5nZXRfYmxvY2soKSwgYW5kIGZvciBh IGNlcnRhaW4gYW1vdW50IG9mIHRpbWUgYWZ0ZXJ3YXJkcy4gIFdpdGggbm9uLURBWCBmaWxlcywg dGhpcyBpcyB0aGUgcGFnZSBsb2NrLiAgTXkgb3JpZ2luYWwgcHJvcG9zYWwgZm9yIHNvbHZpbmcg dGhpcyB3YXMgdG8gZW5oYW5jZSB0aGUgcGFnZSBjYWNoZSByYWRpeCB0cmVlIHRvIGJlIGFibGUg dG8gbG9jayBzb21ldGhpbmcgb3RoZXIgdGhhbiBhIHBhZ2UsIGJ1dCB0aGF0IGFsc28gaGFzIGNv bXBsZXhpdGllcy4NCg0KLS0tLS1PcmlnaW5hbCBNZXNzYWdlLS0tLS0NCkZyb206IEJvYXogSGFy cm9zaCBbbWFpbHRvOmJvYXpAcGxleGlzdG9yLmNvbV0gDQpTZW50OiBUdWVzZGF5LCBBdWd1c3Qg MTEsIDIwMTUgNzozMSBBTQ0KVG86IEphbiBLYXJhOyBEYXZlIENoaW5uZXINCkNjOiBLaXJpbGwg QS4gU2h1dGVtb3Y7IEFuZHJldyBNb3J0b247IFdpbGNveCwgTWF0dGhldyBSOyBsaW51eC1tbUBr dmFjay5vcmc7IGxpbnV4LWZzZGV2ZWxAdmdlci5rZXJuZWwub3JnOyBsaW51eC1rZXJuZWxAdmdl 
ci5rZXJuZWwub3JnOyBEYXZpZGxvaHIgQnVlc28NClN1YmplY3Q6IFJlOiBbUEFUQ0gsIFJGQyAy LzJdIGRheDogdXNlIHJhbmdlX2xvY2sgaW5zdGVhZCBvZiBpX21tYXBfbG9jaw0KDQpPbiAwOC8x MS8yMDE1IDA0OjUwIFBNLCBKYW4gS2FyYSB3cm90ZToNCj4gT24gVHVlIDExLTA4LTE1IDE5OjM3 OjA4LCBEYXZlIENoaW5uZXIgd3JvdGU6DQo+Pj4+IFRoZSBwYXRjaCBiZWxvdyB0cmllcyB0byBy ZWNvdmVyIHNvbWUgc2NhbGFiaWxpdHkgZm9yIERBWCBieSBpbnRyb2R1Y2luZw0KPj4+PiBwZXIt bWFwcGluZyByYW5nZSBsb2NrLg0KPj4+DQo+Pj4gU28gdGhpcyBncm93cyBub3RpY2VhYmx5ICgz IGxvbmdzIGlmIEknbSByaWdodCkgc3RydWN0IGFkZHJlc3Nfc3BhY2UgYW5kDQo+Pj4gdGh1cyBz dHJ1Y3QgaW5vZGUganVzdCBmb3IgREFYLiBUaGF0IGxvb2tzIGxpa2UgYSB3YXN0ZSBidXQgSSBk b24ndCBzZWUgYW4NCj4+PiBlYXN5IHNvbHV0aW9uLg0KPj4+DQo+Pj4gT1RPSCBmaWxlc3lzdGVt cyBpbiBub3JtYWwgbW9kZSBtaWdodCB3YW50IHRvIHVzZSB0aGUgcmFuZ2UgbG9jayBhcyB3ZWxs IHRvDQo+Pj4gcHJvdmlkZSB0cnVuY2F0ZSAvIHB1bmNoIGhvbGUgdnMgcGFnZSBmYXVsdCBleGNs dXNpb24gKFhGUyBhbHJlYWR5IGhhcyBhDQo+Pj4gcHJpdmF0ZSByd3NlbSBmb3IgdGhpcyBhbmQg ZXh0NCBuZWVkcyBzb21ldGhpbmcgYXMgd2VsbCkgYW5kIGF0IHRoYXQgcG9pbnQNCj4+PiBncm93 aW5nIGdlbmVyaWMgc3RydWN0IGlub2RlIHdvdWxkIGJlIGFjY2VwdGFibGUgZm9yIG1lLg0KPj4N Cj4+IEl0IHNvdW5kcyB0byBtZSBsaWtlIHRoZSB3YXkgREFYIGhhcyB0cmllZCB0byBzb2x2ZSB0 aGlzIHJhY2UgaXMgdGhlDQo+PiB3cm9uZyBkaXJlY3Rpb24uIFdlIHJlYWxseSBuZWVkIHRvIGRy aXZlIHRoZSB0cnVuY2F0ZS9wYWdlIGZhdWx0DQo+PiBzZXJpYWxpc2F0aW9uIGhpZ2hlciB1cCB0 aGUgc3RhY2sgdG93YXJkcyB0aGUgZmlsZXN5c3RlbSwgbm90IGRlZXBlcg0KPj4gaW50byB0aGUg bW0gc3Vic3lzdGVtIHdoZXJlIGxvY2tpbmcgaXMgZ3JlYXRseSBsaW1pdGVkLg0KPj4NCj4+IEFz IEphbiBtZW50aW9ucywgd2UgYWxyZWFkeSBoYXZlIHRoaXMgc2VyaWFsaXNhdGlvbiBpbiBYRlMs IGFuZCBJDQo+PiB0aGluayBpdCB3b3VsZCBiZSBiZXR0ZXIgZmlyc3Qgc3RlcCB0byByZXBsaWNh dGUgdGhhdCBsb2NraW5nIG1vZGVsDQo+PiBpbiBlYWNoIGZpbGVzeXN0ZW0gdGhhdCBpcyBzdXBw b3J0cyBEQVguIEkgdGhpbmsgdGhpcyBpcyBhIGJldHRlcg0KPj4gZGlyZWN0aW9uIGJlY2F1c2Ug aXQgbW92ZXMgdG93YXJkcyBzb2x2aW5nIGEgd2hvbGUgY2xhc3Mgb2YgcHJvYmxlbXMNCj4+IGZp bGV5c3RlbSBmYWNlIHdpdGggcGFnZSBmYXVsdCBzZXJpYWxpc2F0aW9uLCBub3QganVzdCBmb3Ig 
REFYLg0KPiANCj4gV2VsbCwgYnV0IGF0IGxlYXN0IGluIFhGUyB5b3UgdGFrZSBYRlNfTU1BUExP Q0sgaW4gc2hhcmVkIG1vZGUgZm9yIHRoZQ0KPiBmYXVsdCAvIHBhZ2VfbWt3cml0ZSBjYWxsYmFj ayBzbyBpdCBkb2Vzbid0IHByb3ZpZGUgdGhlIGV4Y2x1c2lvbiBuZWNlc3NhcnkNCj4gZm9yIERB WCB3aGljaCBuZWVkcyBleGNsdXNpdmUgYWNjZXNzIHRvIHRoZSBwYWdlIGdpdmVuIHJhbmdlIGlu IHRoZSBwYWdlDQo+IGNhY2hlLiBBbmQgcmVwbGFjaW5nIGlfbW1hcF9sb2NrIHdpdGggZnMtcHJp dmF0ZSBtbWFwIHJ3c2VtIGlzIGEgbW9vdA0KPiBleGNlcmNpc2UgKGF0IGxlYXN0IGZyb20gREFY IFBPVikuDQo+IA0KDQpIaSBKYW4uIFNvIHlvdSBnb3QgbWUgY29uZnVzZWQgYWJvdmUuIFlvdSBz YXk6DQoJIkRBWCB3aGljaCBuZWVkcyBleGNsdXNpdmUgYWNjZXNzIHRvIHRoZSBwYWdlIGdpdmVu IHJhbmdlIGluIHRoZSBwYWdlIGNhY2hlIg0KDQpidXQgREFYIGFuZCBwYWdlLWNhY2hlIGFyZSBt dXR1YWxseSBleGNsdXNpdmUuIEkgZ3Vlc3MgeW91IG1lYW50IHRoZSBWTUENCnJhbmdlLCBvciB0 aGUgaW5vZGUtPm1hcHBpbmcgcmFuZ2UgKHdoaWNoIG9uZSBpcyBpdCkNCg0KQWN0dWFsbHkgSSBk byBub3QgdW5kZXJzdGFuZCB0aGlzIHJhY2UgeW91IGd1eXMgZm91bmQgYXQgYWxsLiAoUGxlYXNl IGJlYXIgd2l0aA0KbWUgc29ycnkgZm9yIGJlaW5nIHNsb3cpDQoNCklmIHR3byB0aHJlYWRzIG9m IHRoZSBzYW1lIFZNQSBmYXVsdCBvbiB0aGUgc2FtZSBwdGUNCihJJ20gbm90IHN1cmUgaG93IHlv dSBjYWxsIGl0IEkgbWVhbiBhIHNpbmdsZSA0ayBlbnRyeSBhdCBlYWNoIFZNQXMgcGFnZS10YWJs ZSkNCnRoZW4gdGhlIG1tIGtub3dzIGhvdyB0byBoYW5kbGUgdGhpcyBqdXN0IGZpbmUuDQoNCklm IHR3byBwcm9jZXNzZXMsIGllIHR3byBWTUFzIGZhdWx0IG9uIHRoZSBzYW1lIGlub2RlLT5tYXBw aW5nLiBUaGVuIGFuIGlub2RlDQp3aWRlIGxvY2sgbGlrZSBYRlMncyB0byBwcm90ZWN0IGFnYWlu c3QgaV9zaXplLWNoYW5nZSAvIHRydW5jYXRlIGlzIG1vcmUgdGhhbg0KZW5vdWdoLg0KQmVjYXVz ZSB3aXRoIERBWCB0aGVyZSBpcyBubyBpbm9kZS0+bWFwcGluZyAibWFwcGluZyIgYXQgYWxsLiBZ b3UgaGF2ZSB0aGUgY2FsbA0KaW50byB0aGUgRlMgd2l0aCBnZXRfYmxvY2soKSB0byByZXBsYWNl ICJob2xlcyIgKHplcm8gcGFnZXMpIHdpdGggcmVhbCBhbGxvY2F0ZWQNCmJsb2Nrcywgb24gV1JJ VEUgZmF1bHRzLCBidXQgdGhpcyBjb252ZXJzaW9uIHNob3VsZCBiZSBwcm90ZWN0ZWQgaW5zaWRl IHRoZSBGUw0KYWxyZWFkeS4gVGhlbiB0aGVyZSBpcyB0aGUgYXRvbWljIGV4Y2hhbmdlIG9mIHRo ZSBQVEUgd2hpY2ggaXMgZmluZS4NCihBbmQgdmlzIHZlcnNhIHdpdGggaG9sZXMgbWFwcGluZyBh 
bmQgd3JpdGVzKQ0KDQpUaGVyZSBpcyBubyBnbG9iYWwgIm1hcHBpbmciIHJhZGl4LXRyZWUgc2hh cmVkIGJldHdlZW4gVk1BcyBsaWtlIHdlIGFyZQ0KdXNlZCB0by4NCg0KUGxlYXNlIGV4cGxhaW4g dG8gbWUgdGhlIHJhY2VzIHlvdSBhcmUgc2VlaW5nLiBJIHdvdWxkIGxvdmUgdG8gYWxzbyBzZWUg dGhlbQ0Kd2l0aCB4ZnMuIEkgdGhpbmsgdGhlcmUgdGhleSBzaG91bGQgbm90IGhhcHBlbi4NCg0K PiBTbyByZWdhcmRsZXNzIHdoZXRoZXIgdGhlIGxvY2sgd2lsbCBiZSBhIGZzLXByaXZhdGUgb25l IG9yIGluDQo+IGFkZHJlc3Nfc3BhY2UsIERBWCBuZWVkcyBzb21ldGhpbmcgbGlrZSB0aGUgcmFu Z2UgbG9jayBLaXJpbGwgc3VnZ2VzdGVkLg0KPiBIYXZpbmcgdGhlIHJhbmdlIGxvY2sgaW4gZnMt cHJpdmF0ZSBwYXJ0IG9mIGlub2RlIGhhcyB0aGUgYWR2YW50YWdlIHRoYXQNCj4gb25seSBmaWxl c3lzdGVtcyBzdXBwb3J0aW5nIERBWCAvIHB1bmNoIGhvbGUgd2lsbCBwYXkgdGhlIG1lbW9yeSBv dmVyaGVhZC4NCj4gT1RPSCBtb3N0IG1ham9yIGZpbGVzeXN0ZW1zIG5lZWQgaXQgc28gdGhlIHNh dmluZ3Mgd291bGQgYmUgSU1PIG5vdGljZWFibGUNCg0KcHVuY2gtaG9sZSBpcyB0cnVuY2F0ZSBm b3IgbWUuIFdpdGggdGhlIHhmcyBtb2RlbCBvZiByZWFkLXdyaXRlIGxvY2sgd2hlcmUNCnRydW5j YXRlIHRha2VzIHdyaXRlLCBhbnkgZmF1bHQgdGFraW5nIHJlYWQgYmVmb3JlIGV4ZWN1dGluZyB0 aGUgZmF1bHQgbG9va3MNCmdvb2QgZm9yIHRoZSBGUyBzaWRlIG9mIHRoaW5ncy4gSSBndWVzcyB5 b3UgbWVhbiB0aGUgb3B0aW1pemF0aW9uIG9mIHRoZQ0KcmFkaXgtdHJlZSBsb2NrLiBCdXQgeW91 IHNlZSBEQVggZG9lcyBub3QgaGF2ZSBhIHJhZGl4LXRyZWUsIGllIGl0IGlzIGVtcHR5Lg0KDQpQ bGVhc2UgZXhwbGFpbi4gRG8geW91IGhhdmUgYSByZXByb2R1Y2VyIG9mIHRoaXMgcmFjZS4gSSB3 b3VsZCBuZWVkIHRvIHVuZGVyc3RhbmQNCnRoaXMuDQoNClRoYW5rcw0KQm9heg0KDQo+IG9ubHkg Zm9yIHRpbnkgc3lzdGVtcyB1c2luZyBzcGVjaWFsIGZzIGV0Yy4gU28gSSdtIHVuZGVjaWRlZCB3 aGV0aGVyDQo+IHB1dHRpbmcgdGhlIGxvY2sgaW4gYWRkcmVzc19zcGFjZSBhbmQgZG9pbmcgdGhl IGxvY2tpbmcgaW4gZ2VuZXJpYw0KPiBwYWdlZmF1bHQgLyB0cnVuY2F0ZSBoZWxwZXJzIGlzIGEg YmV0dGVyIGNob2ljZSBvciBub3QuDQo+ICANCj4gCQkJCQkJCQlIb256YQ0KPiANCg0K -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 08/11/2015 07:51 PM, Wilcox, Matthew R wrote: > The race that you're not seeing is page fault vs page fault. Two > threads each attempt to store a byte to different locations on the > same page. With a read-mutex to exclude truncates, each thread calls > ->get_block. One of the threads gets back a buffer marked as BH_New > and calls memset() to clear the page. The other thread gets back a > buffer which isn't marked as BH_New and simply inserts the mapping, > returning to userspace, which stores the byte ... just in time for > the other thread's memset() to write a zero over the top of it. > So I guess all you need is to zero it at the FS allocator like a __GFP_ZERO Perhaps you invent a new flag to pass to ->get_block() like a __GB_FOR_MMAP that will make the FS zero the block in the case it was newly allocated. Sure there is some locking going on inside the FS so to not allocate two blocks for the same inode offset. Then memset_nt() inside or before that lock. ie. In the presence of __GB_FOR_MMAP the FS will internally convert any BH_New to a memset_nt() and regular return. > Oh, and if it's a load racing against a store, you could read data > that used to be in this block the last time it was used; maybe the > contents of /etc/shadow. > Do you have any kind of test like this? I wish we could exercise this and see that we actually solved it. > So if we want to be able to handle more than one page fault at a time > per file (... and I think we do ...), we need to have exclusive > access to a range of the file while we're calling ->get_block(), and > for a certain amount of time afterwards. With non-DAX files, this is > the page lock. My original proposal for solving this was to enhance > the page cache radix tree to be able to lock something other than a > page, but that also has complexities. > Yes this could be done with a bit-lock at the RADIX_TREE_EXCEPTIONAL_ENTRY bits of a radix_tree_exception() entry. 
These are only used by shmem radix-trees and will never conflict with DAX But it means you will need to populate the radix-tree with radix_tree_exception() entries for every block accessed which will be a great pity. I still think the above extra flag to ->get_block() taps into filesystem already existing infra, and will cost much less pain. (And is much more efficient say in hot path like application already did fallocate for performance) Thanks Boaz > -----Original Message----- > From: Boaz Harrosh [mailto:boaz@plexistor.com] > Sent: Tuesday, August 11, 2015 7:31 AM > To: Jan Kara; Dave Chinner > Cc: Kirill A. Shutemov; Andrew Morton; Wilcox, Matthew R; linux-mm@kvack.org; linux-fsdevel@vger.kernel.org; linux-kernel@vger.kernel.org; Davidlohr Bueso > Subject: Re: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock > > On 08/11/2015 04:50 PM, Jan Kara wrote: >> On Tue 11-08-15 19:37:08, Dave Chinner wrote: >>>>> The patch below tries to recover some scalability for DAX by introducing >>>>> per-mapping range lock. >>>> >>>> So this grows noticeably (3 longs if I'm right) struct address_space and >>>> thus struct inode just for DAX. That looks like a waste but I don't see an >>>> easy solution. >>>> >>>> OTOH filesystems in normal mode might want to use the range lock as well to >>>> provide truncate / punch hole vs page fault exclusion (XFS already has a >>>> private rwsem for this and ext4 needs something as well) and at that point >>>> growing generic struct inode would be acceptable for me. >>> >>> It sounds to me like the way DAX has tried to solve this race is the >>> wrong direction. We really need to drive the truncate/page fault >>> serialisation higher up the stack towards the filesystem, not deeper >>> into the mm subsystem where locking is greatly limited. >>> >>> As Jan mentions, we already have this serialisation in XFS, and I >>> think it would be better first step to replicate that locking model >>> in each filesystem that is supports DAX. 
I think this is a better >>> direction because it moves towards solving a whole class of problems >>> fileystem face with page fault serialisation, not just for DAX. >> >> Well, but at least in XFS you take XFS_MMAPLOCK in shared mode for the >> fault / page_mkwrite callback so it doesn't provide the exclusion necessary >> for DAX which needs exclusive access to the page given range in the page >> cache. And replacing i_mmap_lock with fs-private mmap rwsem is a moot >> excercise (at least from DAX POV). >> > > Hi Jan. So you got me confused above. You say: > "DAX which needs exclusive access to the page given range in the page cache" > > but DAX and page-cache are mutually exclusive. I guess you meant the VMA > range, or the inode->mapping range (which one is it) > > Actually I do not understand this race you guys found at all. (Please bear with > me sorry for being slow) > > If two threads of the same VMA fault on the same pte > (I'm not sure how you call it I mean a single 4k entry at each VMAs page-table) > then the mm knows how to handle this just fine. > > If two processes, ie two VMAs fault on the same inode->mapping. Then an inode > wide lock like XFS's to protect against i_size-change / truncate is more than > enough. > Because with DAX there is no inode->mapping "mapping" at all. You have the call > into the FS with get_block() to replace "holes" (zero pages) with real allocated > blocks, on WRITE faults, but this conversion should be protected inside the FS > already. Then there is the atomic exchange of the PTE which is fine. > (And vis versa with holes mapping and writes) > > There is no global "mapping" radix-tree shared between VMAs like we are > used to. > > Please explain to me the races you are seeing. I would love to also see them > with xfs. I think there they should not happen. > >> So regardless whether the lock will be a fs-private one or in >> address_space, DAX needs something like the range lock Kirill suggested. 
>> Having the range lock in fs-private part of inode has the advantage that >> only filesystems supporting DAX / punch hole will pay the memory overhead. >> OTOH most major filesystems need it so the savings would be IMO noticeable > > punch-hole is truncate for me. With the xfs model of read-write lock where > truncate takes write, any fault taking read before executing the fault looks > good for the FS side of things. I guess you mean the optimization of the > radix-tree lock. But you see DAX does not have a radix-tree, ie it is empty. > > Please explain. Do you have a reproducer of this race. I would need to understand > this. > > Thanks > Boaz > >> only for tiny systems using special fs etc. So I'm undecided whether >> putting the lock in address_space and doing the locking in generic >> pagefault / truncate helpers is a better choice or not. >> >> Honza >> > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Aug 11, 2015 at 07:17:12PM +0300, Boaz Harrosh wrote: > On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote: > > We also used lock_page() to make sure we shoot out all pages as we don't > > exclude page faults during truncate. Consider this race: > > > > <fault> <truncate> > > get_block > > check i_size > > update i_size > > unmap > > setup pte > > > > Please consider this senario then: > > <fault> <truncate> > read_lock(inode) > > get_block > check i_size > > read_unlock(inode) > > write_lock(inode) > > update i_size > * remove allocated blocks > unmap > > write_unlock(inode) > > setup pte > > IS what you suppose to do in xfs Do you realize that you describe a race? :-P Exactly in this scenario pfn your pte point to is not belong to the file anymore. Have fun. > > With normal page cache we make sure that all pages beyond i_size is > > dropped using lock_page() in truncate_inode_pages_range(). > > > > Yes there is no truncate_inode_pages_range() in DAX again radix tree is > empty. > > Please do you have a reproducer I would like to see this race and also > experiment with xfs (I guess you saw it in ext4) I don't. And I don't see how race like above can be FS-specific. All critical points in generic code. > > For DAX we need a way to stop all page faults to the pgoff range before > > doing unmap. > > > > Why ? Because you can end up with ptes pointing to pfns which fs consider not be part of the file. <truncate> <fault> unmap.. fault in pfn which unmap already unmapped ..continue unmap > >> Because with DAX there is no inode->mapping "mapping" at all. You have the call > >> into the FS with get_block() to replace "holes" (zero pages) with real allocated > >> blocks, on WRITE faults, but this conversion should be protected inside the FS > >> already. Then there is the atomic exchange of the PTE which is fine. > >> (And vis versa with holes mapping and writes) > > > > Having unmap_mapping_range() in PMD fault handling is very unfortunate. 
> > Go to rmap just to solve page fault is very wrong. > > BTW, we need to do it in write path too. > > > > Only the write path and only when we exchange a zero-page (hole) with > a new allocated (written to) page. Both write fault and/or write-path No. Always on new BH. We don't have anything (except rmap) to find out if any other process have zero page for the pgoff. > > I'm not convinced that all these "let's avoid backing storage allocation" > > in DAX code is not layering violation. I think the right place to solve > > this is filesystem. And we have almost all required handles for this in > > place. We only need to change vm_ops->page_mkwrite() interface to be able > > to return different page than what was given on input. > > > > What? there is no page returned for DAX page_mkwrite(), it is all insert_mixed > with direct pmd. That was bogus idea, please ignore. > > Hm. Where does XFS take this read-write lock in fault path? > > > > IIUC, truncation vs. page fault serialization relies on i_size being > > updated before doing truncate_pagecache() and checking i_size under > > page_lock() on fault side. We don't have i_size fence for punch hole. > > > > again truncate_pagecache() is NONE. > And yes the read-write locking will protect punch-hole just as truncate > see my locking senario above. Do you mean as racy as your truncate scenario? ;-P
On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote: > The race that you're not seeing is page fault vs page fault. Two > threads each attempt to store a byte to different locations on the > same page. With a read-mutex to exclude truncates, each thread > calls ->get_block. One of the threads gets back a buffer marked > as BH_New and calls memset() to clear the page. The other thread > gets back a buffer which isn't marked as BH_New and simply inserts > the mapping, returning to userspace, which stores the byte ... > just in time for the other thread's memset() to write a zero over > the top of it. So, this is not a truncate race that the XFS MMAPLOCK solves. However, that doesn't mean that the DAX code needs to add locking to solve it. The race here is caused by block initialisation being unserialised after a ->get_block call allocates the block (which the filesystem serialises via internal locking). Hence two simultaneous ->get_block calls to the same block is guaranteed to have the DAX block initialisation race with the second ->get_block call that says the block is already allocated. IOWs, the way to handle this is to have the ->get_block call handle the block zeroing for new blocks instead of doing it after the fact in the generic DAX code where there is no fine-grained serialisation object available. By calling dax_clear_blocks() in the ->get_block callback, the filesystem can ensure that the second racing call will only make progress once the block has been fully initialised by the first call. IMO the fix is - again - to move the functionality into the filesystem where we already have the necessary exclusion in place to avoid this race condition entirely. Cheers, Dave.
On 08/11/2015 11:26 PM, Kirill A. Shutemov wrote: > On Tue, Aug 11, 2015 at 07:17:12PM +0300, Boaz Harrosh wrote: >> On 08/11/2015 06:28 PM, Kirill A. Shutemov wrote: >>> We also used lock_page() to make sure we shoot out all pages as we don't >>> exclude page faults during truncate. Consider this race: >>> >>> <fault> <truncate> >>> get_block >>> check i_size >>> update i_size >>> unmap >>> setup pte >>> >> >> Please consider this senario then: >> >> <fault> <truncate> >> read_lock(inode) >> >> get_block >> check i_size >> >> read_unlock(inode) >> >> write_lock(inode) >> >> update i_size >> * remove allocated blocks >> unmap >> >> write_unlock(inode) >> >> setup pte >> >> IS what you suppose to do in xfs > > Do you realize that you describe a race? :-P > > Exactly in this scenario pfn your pte point to is not belong to the file > anymore. Have fun. > Sorry, yes, I wrote it wrong. I have now gone back and read the actual code, and the setup-pte step is also done under the read lock inside the fault handler, before the read lock is released. Of course it is: it is the page-fault handler that does the vm_insert_mixed(vma, ..., pfn), and in the case of concurrent faults the second call to vm_insert_mixed will return -EBUSY, which means all is well. So the only thing left is the fault-to-fault zero-the-page race as Matthew described, and as Dave and I think, we can make this part of the FS's get_block, where it is more natural. Thanks Boaz -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 08/12/2015 12:48 AM, Dave Chinner wrote: > On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote: >> The race that you're not seeing is page fault vs page fault. Two >> threads each attempt to store a byte to different locations on the >> same page. With a read-mutex to exclude truncates, each thread >> calls ->get_block. One of the threads gets back a buffer marked >> as BH_New and calls memset() to clear the page. The other thread >> gets back a buffer which isn't marked as BH_New and simply inserts >> the mapping, returning to userspace, which stores the byte ... >> just in time for the other thread's memset() to write a zero over >> the top of it. > > So, this is not a truncate race that the XFS MMAPLOCK solves. > > However, that doesn't mean that the DAX code needs to add locking to > solve it. The race here is caused by block initialisation being > unserialised after a ->get_block call allocates the block (which the > filesystem serialises via internal locking). Hence two simultaneous > ->get_block calls to the same block is guaranteed to have the DAX > block initialisation race with the second ->get_block call that says > the block is already allocated. > > IOWs, the way to handle this is to have the ->get_block call handle > the block zeroing for new blocks instead of doing it after the fact > in the generic DAX code where there is no fine-grained serialisation > object available. By calling dax_clear_blocks() in the ->get_block > callback, the filesystem can ensure that the second racing call will > only make progress once the block has been fully initialised by the > first call. > > IMO the fix is - again - to move the functionality into the > filesystem where we already have the necessary exclusion in place to > avoid this race condition entirely. > Exactly, thanks > Cheers, > > Dave. 
> -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed 12-08-15 07:48:22, Dave Chinner wrote: > On Tue, Aug 11, 2015 at 04:51:22PM +0000, Wilcox, Matthew R wrote: > > The race that you're not seeing is page fault vs page fault. Two > > threads each attempt to store a byte to different locations on the > > same page. With a read-mutex to exclude truncates, each thread > > calls ->get_block. One of the threads gets back a buffer marked > > as BH_New and calls memset() to clear the page. The other thread > > gets back a buffer which isn't marked as BH_New and simply inserts > > the mapping, returning to userspace, which stores the byte ... > > just in time for the other thread's memset() to write a zero over > > the top of it. > > So, this is not a truncate race that the XFS MMAPLOCK solves. > > However, that doesn't mean that the DAX code needs to add locking to > solve it. The race here is caused by block initialisation being > unserialised after a ->get_block call allocates the block (which the > filesystem serialises via internal locking). Hence two simultaneous > ->get_block calls to the same block is guaranteed to have the DAX > block initialisation race with the second ->get_block call that says > the block is already allocated. > > IOWs, the way to handle this is to have the ->get_block call handle > the block zeroing for new blocks instead of doing it after the fact > in the generic DAX code where there is no fine-grained serialisation > object available. By calling dax_clear_blocks() in the ->get_block > callback, the filesystem can ensure that the second racing call will > only make progress once the block has been fully initialised by the > first call. > > IMO the fix is - again - to move the functionality into the > filesystem where we already have the necessary exclusion in place to > avoid this race condition entirely. 
I'm somewhat sad to add even more functionality into the already loaded block mapping interface - we can already allocate delalloc blocks, unwritten blocks, uninitialized blocks, and now also pre-zeroed blocks. But I agree the fs already synchronizes block allocation for a given inode, so adding the pre-zeroing there is pretty easy. Also, getting rid of unwritten extent handling from the DAX code is a nice bonus, so all in all I'm for this approach. Honza
diff --git a/fs/dax.c b/fs/dax.c index ed54efedade6..27a68eca698e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -333,6 +333,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct inode *inode = mapping->host; struct page *page; struct buffer_head bh; + struct range_lock mapping_lock; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; sector_t block; @@ -348,6 +349,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PAGE_SIZE; + /* do_cow_fault() takes the lock */ + if (!vmf->cow_page) { + range_lock_init(&mapping_lock, vmf->pgoff, vmf->pgoff); + range_lock(&mapping->mapping_lock, &mapping_lock); + } repeat: page = find_get_page(mapping, vmf->pgoff); if (page) { @@ -369,8 +375,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, error = -EIO; goto unlock; } - } else { - i_mmap_lock_write(mapping); } error = get_block(inode, block, &bh, 0); @@ -390,8 +394,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto unlock; } else { - i_mmap_unlock_write(mapping); - return dax_load_hole(mapping, page, vmf); + error = dax_load_hole(mapping, page, vmf); + range_unlock(&mapping->mapping_lock, &mapping_lock); + return error; } } @@ -446,9 +451,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } - if (!page) - i_mmap_unlock_write(mapping); out: + if (!vmf->cow_page) + range_unlock(&mapping->mapping_lock, &mapping_lock); if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ @@ -460,10 +465,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (page) { unlock_page(page); page_cache_release(page); - } else { - i_mmap_unlock_write(mapping); } - goto out; } EXPORT_SYMBOL(__dax_fault); @@ -510,6 +512,7 @@ int __dax_pmd_fault(struct 
vm_area_struct *vma, unsigned long address, struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct buffer_head bh; + struct range_lock mapping_lock; unsigned blkbits = inode->i_blkbits; unsigned long pmd_addr = address & PMD_MASK; bool write = flags & FAULT_FLAG_WRITE; @@ -541,7 +544,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - i_mmap_lock_write(mapping); + range_lock_init(&mapping_lock, pgoff, pgoff + HPAGE_PMD_NR - 1); + range_lock(&mapping->mapping_lock, &mapping_lock); length = get_block(inode, block, &bh, write); if (length) return VM_FAULT_SIGBUS; @@ -568,9 +572,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * zero pages covering this hole */ if (buffer_new(&bh)) { - i_mmap_unlock_write(mapping); + range_unlock(&mapping->mapping_lock, &mapping_lock); unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); - i_mmap_lock_write(mapping); + range_lock(&mapping->mapping_lock, &mapping_lock); } /* @@ -624,7 +628,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (buffer_unwritten(&bh)) complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - i_mmap_unlock_write(mapping); + range_unlock(&mapping->mapping_lock, &mapping_lock); return result; diff --git a/fs/inode.c b/fs/inode.c index e560535706ff..6a24144d679f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -343,6 +343,7 @@ void address_space_init_once(struct address_space *mapping) INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); spin_lock_init(&mapping->tree_lock); init_rwsem(&mapping->i_mmap_rwsem); + range_lock_tree_init(&mapping->mapping_lock); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; diff --git a/include/linux/fs.h b/include/linux/fs.h index b6361e2e2a26..368e7208d4f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -30,6 +30,7 @@ 
#include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/blk_types.h> +#include <linux/range_lock.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -429,6 +430,7 @@ struct address_space { atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ + struct range_lock_tree mapping_lock; /* lock_page() replacement for DAX */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ unsigned long nrshadows; /* number of shadow entries */ diff --git a/mm/memory.c b/mm/memory.c index 7f6a9563d5a6..b4898db7e4c4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2409,6 +2409,7 @@ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { struct zap_details details; + struct range_lock mapping_lock; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -2426,10 +2427,17 @@ void unmap_mapping_range(struct address_space *mapping, if (details.last_index < details.first_index) details.last_index = ULONG_MAX; + if (IS_DAX(mapping->host)) { + /* Exclude fault under us */ + range_lock_init(&mapping_lock, hba, hba + hlen - 1); + range_lock(&mapping->mapping_lock, &mapping_lock); + } i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); + if (IS_DAX(mapping->host)) + range_unlock(&mapping->mapping_lock, &mapping_lock); } EXPORT_SYMBOL(unmap_mapping_range); @@ -2978,6 +2986,8 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { + struct address_space *mapping = vma->vm_file->f_mapping; + struct range_lock mapping_lock; struct page *fault_page, *new_page; struct 
mem_cgroup *memcg; spinlock_t *ptl; @@ -2996,6 +3006,15 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } + if (IS_DAX(file_inode(vma->vm_file))) { + /* + * The fault handler has no page to lock, so it holds + * mapping->mapping_lock to protect against truncate. + */ + range_lock_init(&mapping_lock, pgoff, pgoff); + range_unlock(&mapping->mapping_lock, &mapping_lock); + } + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; @@ -3010,12 +3029,6 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (fault_page) { unlock_page(fault_page); page_cache_release(fault_page); - } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for write to protect against truncate. - */ - i_mmap_unlock_write(vma->vm_file->f_mapping); } goto uncharge_out; } @@ -3026,15 +3039,13 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (fault_page) { unlock_page(fault_page); page_cache_release(fault_page); - } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for write to protect against truncate. - */ - i_mmap_unlock_write(vma->vm_file->f_mapping); } + if (IS_DAX(file_inode(vma->vm_file))) + range_unlock(&mapping->mapping_lock, &mapping_lock); return ret; uncharge_out: + if (IS_DAX(file_inode(vma->vm_file))) + range_unlock(&mapping->mapping_lock, &mapping_lock); mem_cgroup_cancel_charge(new_page, memcg); page_cache_release(new_page); return ret; diff --git a/mm/rmap.c b/mm/rmap.c index dcaad464aab0..3d509326d354 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -22,6 +22,7 @@ * * inode->i_mutex (while writing or truncating, not reading or faulting) * mm->mmap_sem + * mapping->mapping_lock * page->flags PG_locked (lock_page) * mapping->i_mmap_rwsem * anon_vma->rwsem
As we don't have struct pages for DAX memory, Matthew had to find a replacement for lock_page() to avoid fault vs. truncate races. i_mmap_lock was used for that. Recently, Matthew had to convert locking to exclusive to address fault vs. fault races. And this kills scalability completely. The patch below tries to recover some scalability for DAX by introducing a per-mapping range lock. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- fs/dax.c | 30 +++++++++++++++++------------- fs/inode.c | 1 + include/linux/fs.h | 2 ++ mm/memory.c | 35 +++++++++++++++++++++++------------ mm/rmap.c | 1 + 5 files changed, 44 insertions(+), 25 deletions(-)