
[RFC,V3,3/6] arm: mm: implement get_user_pages_fast

Message ID 1394631623-17883-4-git-send-email-steve.capper@linaro.org (mailing list archive)
State New, archived

Commit Message

Steve Capper March 12, 2014, 1:40 p.m. UTC
An implementation of get_user_pages_fast for ARM. It is based loosely
on the PowerPC implementation. We disable interrupts in the walker to
prevent the call_rcu_sched pagetable freeing code from running under
us.

We also explicitly fire an IPI in the Transparent HugePage splitting
case to prevent splits from interfering with the fast_gup walker.
As THP splits are relatively rare, this should not have a noticeable
overhead.

Signed-off-by: Steve Capper <steve.capper@linaro.org>
---
 arch/arm/include/asm/pgtable-3level.h |   6 +
 arch/arm/mm/Makefile                  |   1 +
 arch/arm/mm/gup.c                     | 299 ++++++++++++++++++++++++++++++++++
 3 files changed, 306 insertions(+)
 create mode 100644 arch/arm/mm/gup.c
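
For context, a minimal sketch of how a caller might use the interface this
patch implements; the buffer, its length and the user_addr variable are
illustrative, not taken from the patch:

	/* Pin up to 16 pages of a user buffer for access, then release. */
	struct page *pages[16];
	int i, nr;

	nr = get_user_pages_fast(user_addr, 16, 1 /* write */, pages);
	for (i = 0; i < nr; i++) {
		/* ... access the pinned page, e.g. via kmap_atomic() ... */
		put_page(pages[i]);
	}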

Comments

Peter Zijlstra March 12, 2014, 2:18 p.m. UTC | #1
On Wed, Mar 12, 2014 at 01:40:20PM +0000, Steve Capper wrote:
> +int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> +			  struct page **pages)
> +{
> +	struct mm_struct *mm = current->mm;
> +	unsigned long addr, len, end;
> +	unsigned long next, flags;
> +	pgd_t *pgdp;
> +	int nr = 0;
> +
> +	start &= PAGE_MASK;
> +	addr = start;
> +	len = (unsigned long) nr_pages << PAGE_SHIFT;
> +	end = start + len;
> +
> +	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
> +					start, len)))
> +		return 0;
> +
> +	/*
> +	 * Disable interrupts. We use the nested form as we can already
> +	 * have interrupts disabled by get_futex_key.
> +	 *
> +	 * With interrupts disabled, we block page table pages from being
> +	 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
> +	 * for more details.
> +	 */
> +
> +	local_irq_save(flags);
> +	pgdp = pgd_offset(mm, addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		if (pgd_none(*pgdp))
> +			break;
> +		else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
> +			break;
> +	} while (pgdp++, addr = next, addr != end);
> +	local_irq_restore(flags);
> +
> +	return nr;
> +}

Since you just went through the trouble of enabling RCU pagetable
freeing, you might also replace these local_irq_save/restore with
rcu_read_{,un}lock().

Typically rcu_read_lock() is faster than disabling interrupts; but I've
no clue about ARM.
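
A rough sketch of what that suggestion could look like (hypothetical, not
part of the posted patch; since the page tables here are freed via
call_rcu_sched, rcu_read_lock_sched() would be the matching read-side
primitive rather than plain rcu_read_lock()):

	rcu_read_lock_sched();	/* pins tables freed via call_rcu_sched */
	pgdp = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none(*pgdp))
			break;
		else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
			break;
	} while (pgdp++, addr = next, addr != end);
	rcu_read_unlock_sched();

As the reply below notes, this on its own would not serialise against the
IPI used to stop THP splitting, since an IPI is still delivered with only
preemption disabled.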
Steve Capper March 12, 2014, 4:20 p.m. UTC | #2
On 12 March 2014 14:18, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Mar 12, 2014 at 01:40:20PM +0000, Steve Capper wrote:
>> +int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
>> +                       struct page **pages)
>> +{
>> +     struct mm_struct *mm = current->mm;
>> +     unsigned long addr, len, end;
>> +     unsigned long next, flags;
>> +     pgd_t *pgdp;
>> +     int nr = 0;
>> +
>> +     start &= PAGE_MASK;
>> +     addr = start;
>> +     len = (unsigned long) nr_pages << PAGE_SHIFT;
>> +     end = start + len;
>> +
>> +     if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
>> +                                     start, len)))
>> +             return 0;
>> +
>> +     /*
>> +      * Disable interrupts. We use the nested form as we can already
>> +      * have interrupts disabled by get_futex_key.
>> +      *
>> +      * With interrupts disabled, we block page table pages from being
>> +      * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
>> +      * for more details.
>> +      */
>> +
>> +     local_irq_save(flags);
>> +     pgdp = pgd_offset(mm, addr);
>> +     do {
>> +             next = pgd_addr_end(addr, end);
>> +             if (pgd_none(*pgdp))
>> +                     break;
>> +             else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
>> +                     break;
>> +     } while (pgdp++, addr = next, addr != end);
>> +     local_irq_restore(flags);
>> +
>> +     return nr;
>> +}
>
> Since you just went through the trouble of enabling RCU pagetable
> freeing, you might also replace these local_irq_save/restore with
> rcu_read_{,un}lock().

Hi Peter,
This critical section also needs to block the THP splitting code. At
the moment an IPI is broadcast in pmdp_splitting_flush. I'm not sure
how to adapt that to block on an rcu_read_lock; I'll have a think.

Cheers,
Peter Zijlstra March 12, 2014, 4:30 p.m. UTC | #3
On Wed, Mar 12, 2014 at 04:20:15PM +0000, Steve Capper wrote:
> On 12 March 2014 14:18, Peter Zijlstra <peterz@infradead.org> wrote:
> > Since you just went through the trouble of enabling RCU pagetable
> > freeing, you might also replace these local_irq_save/restore with
> > rcu_read_{,un}lock().
> 
> Hi Peter,
> This critical section also needs to block the THP splitting code. At
> the moment an IPI is broadcast in pmdp_splitting_flush. I'm not sure
> how to adapt that to block on an rcu_read_lock; I'll have a think.

Ah, I've not looked at THP much at all.

Would it be sufficient to make sure to fail the pmd get_page()
equivalent early enough?
Peter Zijlstra March 12, 2014, 4:32 p.m. UTC | #4
On Wed, Mar 12, 2014 at 01:40:20PM +0000, Steve Capper wrote:
> +void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
> +			  pmd_t *pmdp)
> +{
> +	pmd_t pmd = pmd_mksplitting(*pmdp);
> +	VM_BUG_ON(address & ~PMD_MASK);
> +	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
> +
> +	/* dummy IPI to serialise against fast_gup */
> +	smp_call_function(thp_splitting_flush_sync, NULL, 1);
> +}

do you really need to IPI the entire machine? Wouldn't the mm's TLB
invalidate mask be sufficient?
Steve Capper March 12, 2014, 4:41 p.m. UTC | #5
On 12 March 2014 16:32, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Mar 12, 2014 at 01:40:20PM +0000, Steve Capper wrote:
>> +void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
>> +                       pmd_t *pmdp)
>> +{
>> +     pmd_t pmd = pmd_mksplitting(*pmdp);
>> +     VM_BUG_ON(address & ~PMD_MASK);
>> +     set_pmd_at(vma->vm_mm, address, pmdp, pmd);
>> +
>> +     /* dummy IPI to serialise against fast_gup */
>> +     smp_call_function(thp_splitting_flush_sync, NULL, 1);
>> +}
>
> do you really need to IPI the entire machine? Wouldn't the mm's TLB
> invalidate mask be sufficient?

Thank you! Yes, that would be a much better idea. I'll correct this.
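
A hedged sketch of that refinement (hypothetical, assuming
pmdp_splitting_flush() is reached with the page-table lock held, so
preemption is already disabled as smp_call_function_many() requires; the
local CPU is skipped, which is fine because it cannot simultaneously be
inside the interrupts-off fast_gup walk):

	/* IPI only the CPUs that have used this mm, not the whole machine */
	smp_call_function_many(mm_cpumask(vma->vm_mm),
			       thp_splitting_flush_sync, NULL, 1);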
Steve Capper March 12, 2014, 4:42 p.m. UTC | #6
On 12 March 2014 16:30, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Mar 12, 2014 at 04:20:15PM +0000, Steve Capper wrote:
>> On 12 March 2014 14:18, Peter Zijlstra <peterz@infradead.org> wrote:
>> > Since you just went through the trouble of enabling RCU pagetable
>> > freeing, you might also replace these local_irq_save/restore with
>> > rcu_read_{,un}lock().
>>
>> Hi Peter,
>> This critical section also needs to block the THP splitting code. At
>> the moment an IPI is broadcast in pmdp_splitting_flush. I'm not sure
>> how to adapt that to block on an rcu_read_lock; I'll have a think.
>
> Ah, I've not looked at THP much at all.
>
> Would it be sufficient to make sure to fail the pmd get_page()
> equivalent early enough?

I don't think that will be enough, as we haven't locked anything. I'll
refine the IPI as per your suggestion.
Catalin Marinas March 12, 2014, 5:15 p.m. UTC | #7
On Wed, Mar 12, 2014 at 01:40:20PM +0000, Steve Capper wrote:
> An implementation of get_user_pages_fast for ARM. It is based loosely
> on the PowerPC implementation. We disable interrupts in the walker to
> prevent the call_rcu_sched pagetable freeing code from running under
> us.
> 
> We also explicitly fire an IPI in the Transparent HugePage splitting
> case to prevent splits from interfering with the fast_gup walker.
> As THP splits are relatively rare, this should not have a noticeable
> overhead.
> 
> Signed-off-by: Steve Capper <steve.capper@linaro.org>
> ---
>  arch/arm/include/asm/pgtable-3level.h |   6 +
>  arch/arm/mm/Makefile                  |   1 +
>  arch/arm/mm/gup.c                     | 299 ++++++++++++++++++++++++++++++++++
>  3 files changed, 306 insertions(+)
>  create mode 100644 arch/arm/mm/gup.c

Is there anything specific to ARM in this gup.c file? Could we make it
more generic like mm/gup.c?
Steve Capper March 13, 2014, 8:03 a.m. UTC | #8
On 12 March 2014 17:15, Catalin Marinas <catalin.marinas@arm.com> wrote:
> On Wed, Mar 12, 2014 at 01:40:20PM +0000, Steve Capper wrote:
>> An implementation of get_user_pages_fast for ARM. It is based loosely
>> on the PowerPC implementation. We disable interrupts in the walker to
>> prevent the call_rcu_sched pagetable freeing code from running under
>> us.
>>
>> We also explicitly fire an IPI in the Transparent HugePage splitting
>> case to prevent splits from interfering with the fast_gup walker.
>> As THP splits are relatively rare, this should not have a noticeable
>> overhead.
>>
>> Signed-off-by: Steve Capper <steve.capper@linaro.org>
>> ---
>>  arch/arm/include/asm/pgtable-3level.h |   6 +
>>  arch/arm/mm/Makefile                  |   1 +
>>  arch/arm/mm/gup.c                     | 299 ++++++++++++++++++++++++++++++++++
>>  3 files changed, 306 insertions(+)
>>  create mode 100644 arch/arm/mm/gup.c
>
> Is there anything specific to ARM in this gup.c file? Could we make it
> more generic like mm/gup.c?

Hi Catalin,
The arm and arm64 cases assume that we can read the PTEs atomically,
that hardware TLB broadcasts can occur (so we have to use the
page_cache_get_speculative logic), and that hugetlb pages are
equivalent in PTE layout to THPs.
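
The speculative logic referred to here is the get-then-recheck pattern,
excerpted (with extra comments) from gup_pte_range() in the patch below:

	pte_t pte = ACCESS_ONCE(*ptep);		/* single atomic read */
	...
	if (!page_cache_get_speculative(page))	/* fails if refcount is zero */
		goto pte_unmap;

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		put_page(page);		/* pte changed under us: back out */
		goto pte_unmap;
	}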

Also, I took a quick look at the other architectures; a summary of my
findings is in this post:
http://lists.infradead.org/pipermail/linux-arm-kernel/2014-March/239326.html

Cheers,

Patch

diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index b286ba9..fdc4a4f 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -226,6 +226,12 @@  static inline pte_t pte_mkspecial(pte_t pte)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)	(pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
 #define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+			  pmd_t *pmdp);
+#endif
 #endif
 
 #define PMD_BIT_FUNC(fn,op) \
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 7f39ce2..a2c4e87 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -7,6 +7,7 @@  obj-y				:= dma-mapping.o extable.o fault.o init.o \
 
 obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
 				   mmap.o pgd.o mmu.o
+obj-$(CONFIG_ARM_LPAE)		+= gup.o
 
 ifneq ($(CONFIG_MMU),y)
 obj-y				+= nommu.o
diff --git a/arch/arm/mm/gup.c b/arch/arm/mm/gup.c
new file mode 100644
index 0000000..715ab0d
--- /dev/null
+++ b/arch/arm/mm/gup.c
@@ -0,0 +1,299 @@ 
+/*
+ * arch/arm/mm/gup.c
+ *
+ * Copyright (C) 2014 Linaro Ltd.
+ *
+ * Based on arch/powerpc/mm/gup.c which is:
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	pte_t *ptep, *ptem;
+	int ret = 0;
+
+	ptem = ptep = pte_offset_map(&pmd, addr);
+	do {
+		pte_t pte = ACCESS_ONCE(*ptep);
+		struct page *page;
+
+		if (!pte_valid_user(pte) || pte_special(pte)
+			|| (write && !pte_write(pte)))
+			goto pte_unmap;
+
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+
+		if (!page_cache_get_speculative(page))
+			goto pte_unmap;
+
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			goto pte_unmap;
+		}
+
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	ret = 1;
+
+pte_unmap:
+	pte_unmap(ptem);
+	return ret;
+}
+
+static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	struct page *head, *page, *tail;
+	int refs;
+
+	if (!pmd_present(orig) || (write && !pmd_write(orig)))
+		return 0;
+
+	refs = 0;
+	head = pmd_page(orig);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	tail = page;
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	/*
+	 * Any tail pages need their mapcount reference taken before we
+	 * return. (This allows the THP code to bump their ref count when
+	 * they are split into base pages).
+	 */
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
+	}
+
+	return 1;
+}
+
+static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	struct page *head, *page, *tail;
+	pmd_t origpmd = __pmd(pud_val(orig));
+	int refs;
+
+	if (!pmd_present(origpmd) || (write && !pmd_write(origpmd)))
+		return 0;
+
+	refs = 0;
+	head = pmd_page(origpmd);
+	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	tail = page;
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
+	}
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+			return 0;
+
+		if (unlikely(pmd_thp_or_huge(pmd))) {
+			if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+				pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(pgdp, addr);
+	do {
+		pud_t pud = ACCESS_ONCE(*pudp);
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (pud_huge(pud)) {
+			if (!gup_huge_pud(pud, pudp, addr, next, write,
+					pages, nr))
+				return 0;
+		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next, flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, len)))
+		return 0;
+
+	/*
+	 * Disable interrupts. We use the nested form as we can already
+	 * have interrupts disabled by get_futex_key.
+	 *
+	 * With interrupts disabled, we block page table pages from being
+	 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
+	 * for more details.
+	 */
+
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(*pgdp))
+			break;
+		else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	int nr, ret;
+
+	start &= PAGE_MASK;
+	nr = __get_user_pages_fast(start, nr_pages, write, pages);
+	ret = nr;
+
+	if (nr < nr_pages) {
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+				     nr_pages - nr, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+static void thp_splitting_flush_sync(void *arg)
+{
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+			  pmd_t *pmdp)
+{
+	pmd_t pmd = pmd_mksplitting(*pmdp);
+	VM_BUG_ON(address & ~PMD_MASK);
+	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+
+	/* dummy IPI to serialise against fast_gup */
+	smp_call_function(thp_splitting_flush_sync, NULL, 1);
+}
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */