From patchwork Mon Aug 10 15:14:24 2015
X-Patchwork-Id: 6985351
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: Andrew Morton, Matthew Wilcox
Cc: linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
    linux-kernel@vger.kernel.org, Davidlohr Bueso, Jan Kara,
    "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCH, RFC 2/2] dax: use range_lock instead of i_mmap_lock
Date: Mon, 10 Aug 2015 18:14:24 +0300
Message-Id: <1439219664-88088-3-git-send-email-kirill.shutemov@linux.intel.com>
In-Reply-To: <1439219664-88088-1-git-send-email-kirill.shutemov@linux.intel.com>
References: <1439219664-88088-1-git-send-email-kirill.shutemov@linux.intel.com>

As we don't have struct pages for DAX memory, Matthew had to find a
replacement for lock_page() to avoid fault vs. truncate races;
i_mmap_lock was used for that. Recently, Matthew had to convert the
locking to exclusive to address fault vs. fault races, and this kills
scalability completely.

The patch below tries to recover some scalability for DAX by
introducing a per-mapping range lock.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
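
A minimal sketch of the locking pattern the patch relies on, for
readers who don't have patch 1/2 of the series in front of them. The
range_lock_* calls are exactly the ones used below; the API itself is
introduced in the previous patch (not shown here), so treat the exact
semantics as assumed rather than confirmed:

	/*
	 * Illustrative only -- not part of this patch.  A range_lock_tree
	 * hangs off each address_space; each critical section takes an
	 * on-stack range_lock over an inclusive [start, end] range of
	 * page offsets.  Non-overlapping ranges don't block each other,
	 * which is what recovers the scalability lost when i_mmap_lock
	 * became exclusive.
	 */
	struct range_lock_tree tree;		/* one per mapping */
	struct range_lock req;			/* one per lock request */

	range_lock_tree_init(&tree);

	range_lock_init(&req, pgoff, pgoff);	/* single-page fault */
	range_lock(&tree, &req);	/* waits only for overlapping ranges */
	/* ... handle the fault at pgoff ... */
	range_unlock(&tree, &req);
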
---
 fs/dax.c           | 30 +++++++++++++++++-------------
 fs/inode.c         |  1 +
 include/linux/fs.h |  2 ++
 mm/memory.c        | 35 +++++++++++++++++++++++------------
 mm/rmap.c          |  1 +
 5 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index ed54efedade6..27a68eca698e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -333,6 +333,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct inode *inode = mapping->host;
 	struct page *page;
 	struct buffer_head bh;
+	struct range_lock mapping_lock;
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	unsigned blkbits = inode->i_blkbits;
 	sector_t block;
@@ -348,6 +349,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
 	bh.b_size = PAGE_SIZE;
 
+	/* do_cow_fault() takes the lock */
+	if (!vmf->cow_page) {
+		range_lock_init(&mapping_lock, vmf->pgoff, vmf->pgoff);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
 repeat:
 	page = find_get_page(mapping, vmf->pgoff);
 	if (page) {
@@ -369,8 +375,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			error = -EIO;
 			goto unlock;
 		}
-	} else {
-		i_mmap_lock_write(mapping);
 	}
 
 	error = get_block(inode, block, &bh, 0);
@@ -390,8 +394,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			if (error)
 				goto unlock;
 		} else {
-			i_mmap_unlock_write(mapping);
-			return dax_load_hole(mapping, page, vmf);
+			error = dax_load_hole(mapping, page, vmf);
+			range_unlock(&mapping->mapping_lock, &mapping_lock);
+			return error;
 		}
 	}
 
@@ -446,9 +451,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
 	}
 
-	if (!page)
-		i_mmap_unlock_write(mapping);
 out:
+	if (!vmf->cow_page)
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
@@ -460,10 +465,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
-	} else {
-		i_mmap_unlock_write(mapping);
 	}
-
 	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -510,6 +512,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	struct buffer_head bh;
+	struct range_lock mapping_lock;
 	unsigned blkbits = inode->i_blkbits;
 	unsigned long pmd_addr = address & PMD_MASK;
 	bool write = flags & FAULT_FLAG_WRITE;
@@ -541,7 +544,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	i_mmap_lock_write(mapping);
+	range_lock_init(&mapping_lock, pgoff, pgoff + HPAGE_PMD_NR - 1);
+	range_lock(&mapping->mapping_lock, &mapping_lock);
 	length = get_block(inode, block, &bh, write);
 	if (length)
 		return VM_FAULT_SIGBUS;
@@ -568,9 +572,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 * zero pages covering this hole
 	 */
 	if (buffer_new(&bh)) {
-		i_mmap_unlock_write(mapping);
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_write(mapping);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
 	}
 
 	/*
@@ -624,7 +628,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (buffer_unwritten(&bh))
 		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 
-	i_mmap_unlock_write(mapping);
+	range_unlock(&mapping->mapping_lock, &mapping_lock);
 
 	return result;
diff --git a/fs/inode.c b/fs/inode.c
index e560535706ff..6a24144d679f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -343,6 +343,7 @@ void address_space_init_once(struct address_space *mapping)
 	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
 	spin_lock_init(&mapping->tree_lock);
 	init_rwsem(&mapping->i_mmap_rwsem);
+	range_lock_tree_init(&mapping->mapping_lock);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	mapping->i_mmap = RB_ROOT;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b6361e2e2a26..368e7208d4f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -30,6 +30,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <linux/range_lock.h>
 #include <...>
 #include <...>
@@ -429,6 +430,7 @@ struct address_space {
 	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
+	struct range_lock_tree	mapping_lock;	/* lock_page() replacement for DAX */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
 	unsigned long		nrshadows;	/* number of shadow entries */
diff --git a/mm/memory.c b/mm/memory.c
index 7f6a9563d5a6..b4898db7e4c4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2409,6 +2409,7 @@ void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
 	struct zap_details details;
+	struct range_lock mapping_lock;
 	pgoff_t hba = holebegin >> PAGE_SHIFT;
 	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
@@ -2426,10 +2427,17 @@ void unmap_mapping_range(struct address_space *mapping,
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 
+	if (IS_DAX(mapping->host)) {
+		/* Exclude fault under us */
+		range_lock_init(&mapping_lock, hba, hba + hlen - 1);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
 	i_mmap_lock_write(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	i_mmap_unlock_write(mapping);
+	if (IS_DAX(mapping->host))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -2978,6 +2986,8 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct range_lock mapping_lock;
 	struct page *fault_page, *new_page;
 	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
@@ -2996,6 +3006,15 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	}
 
+	if (IS_DAX(file_inode(vma->vm_file))) {
+		/*
+		 * The fault handler has no page to lock, so it holds
+		 * mapping->mapping_lock to protect against truncate.
+		 */
+		range_lock_init(&mapping_lock, pgoff, pgoff);
+		range_lock(&mapping->mapping_lock, &mapping_lock);
+	}
+
 	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		goto uncharge_out;
@@ -3010,12 +3029,6 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (fault_page) {
 			unlock_page(fault_page);
 			page_cache_release(fault_page);
-		} else {
-			/*
-			 * The fault handler has no page to lock, so it holds
-			 * i_mmap_lock for write to protect against truncate.
-			 */
-			i_mmap_unlock_write(vma->vm_file->f_mapping);
 		}
 		goto uncharge_out;
 	}
@@ -3026,15 +3039,13 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (fault_page) {
 		unlock_page(fault_page);
 		page_cache_release(fault_page);
-	} else {
-		/*
-		 * The fault handler has no page to lock, so it holds
-		 * i_mmap_lock for write to protect against truncate.
-		 */
-		i_mmap_unlock_write(vma->vm_file->f_mapping);
 	}
+	if (IS_DAX(file_inode(vma->vm_file)))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	return ret;
 uncharge_out:
+	if (IS_DAX(file_inode(vma->vm_file)))
+		range_unlock(&mapping->mapping_lock, &mapping_lock);
 	mem_cgroup_cancel_charge(new_page, memcg);
 	page_cache_release(new_page);
 	return ret;
diff --git a/mm/rmap.c b/mm/rmap.c
index dcaad464aab0..3d509326d354 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -22,6 +22,7 @@
  *
  * inode->i_mutex	(while writing or truncating, not reading or faulting)
  *   mm->mmap_sem
+ *     mapping->mapping_lock
  *       page->flags PG_locked (lock_page)
  *         mapping->i_mmap_rwsem
  *           anon_vma->rwsem
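
To make the fault vs. truncate exclusion concrete, here is a rough
sketch of the two sides as they end up after this patch. The function
names are illustrative only and the range_lock API is assumed from
patch 1/2; the real paths are __dax_fault()/__dax_pmd_fault() and
unmap_mapping_range() above. Note the sketch respects the ordering
added to the mm/rmap.c comment: mapping_lock nests under mmap_sem and
above i_mmap_rwsem.

	/* Fault side: lock only the offset being faulted in. */
	static void fault_side(struct address_space *mapping, pgoff_t pgoff)
	{
		struct range_lock req;

		range_lock_init(&req, pgoff, pgoff);
		range_lock(&mapping->mapping_lock, &req);
		/* install the PTE; truncate of this page now waits */
		range_unlock(&mapping->mapping_lock, &req);
	}

	/* Truncate side: lock the whole hole before zapping PTEs. */
	static void truncate_side(struct address_space *mapping,
				  pgoff_t first, pgoff_t last)
	{
		struct range_lock req;

		range_lock_init(&req, first, last);
		range_lock(&mapping->mapping_lock, &req);
		/* zap PTEs; faults anywhere in [first, last] now wait */
		range_unlock(&mapping->mapping_lock, &req);
	}

Faults at different offsets, and faults outside a hole being punched,
proceed in parallel; only genuinely overlapping ranges serialize.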