
[v3,3/5] mm, thp: introduce FOLL_SPLIT_PMD

Message ID 20190612220320.2223898-4-songliubraving@fb.com (mailing list archive)
State New, archived
Series THP aware uprobe

Commit Message

Song Liu June 12, 2019, 10:03 p.m. UTC
This patch introduces a new foll_flag: FOLL_SPLIT_PMD. As the name
suggests, FOLL_SPLIT_PMD splits the huge pmd for the given mm_struct;
the underlying huge page stays as-is.

FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
but would like to switch back to the huge page and huge pmd later on.
One such example is uprobe. The following patches use FOLL_SPLIT_PMD
in uprobe.
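
A caller sketch (illustrative only, not part of this patch; the real
call site is added to the uprobe code later in this series, and mm and
vaddr are assumed locals):

	struct page *page;
	struct vm_area_struct *vma;
	int ret;

	/*
	 * Split the PMD mapping vaddr; the huge page itself stays
	 * intact, and on success page points to the small page that
	 * now maps vaddr.
	 */
	ret = get_user_pages_remote(NULL, mm, vaddr, 1,
				    FOLL_FORCE | FOLL_SPLIT_PMD,
				    &page, &vma, NULL);
	if (ret <= 0)
		return ret;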

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/mm.h |  1 +
 mm/gup.c           | 38 +++++++++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 3 deletions(-)

Comments

Kirill A. Shutemov June 13, 2019, 12:57 p.m. UTC | #1
On Wed, Jun 12, 2019 at 03:03:17PM -0700, Song Liu wrote:
> This patch introduces a new foll_flag: FOLL_SPLIT_PMD. As the name
> suggests, FOLL_SPLIT_PMD splits the huge pmd for the given mm_struct;
> the underlying huge page stays as-is.
> 
> FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
> but would like to switch back to the huge page and huge pmd later on.
> One such example is uprobe. The following patches use FOLL_SPLIT_PMD
> in uprobe.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  include/linux/mm.h |  1 +
>  mm/gup.c           | 38 +++++++++++++++++++++++++++++++++++---
>  2 files changed, 36 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0ab8c7d84cd0..e605acc4fc81 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2642,6 +2642,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>  #define FOLL_COW	0x4000	/* internal GUP flag */
>  #define FOLL_ANON	0x8000	/* don't do file mappings */
>  #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
> +#define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
>  
>  /*
>   * NOTE on FOLL_LONGTERM:
> diff --git a/mm/gup.c b/mm/gup.c
> index ddde097cf9e4..3d05bddb56c9 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -398,7 +398,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>  		spin_unlock(ptl);
>  		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>  	}
> -	if (flags & FOLL_SPLIT) {
> +	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>  		int ret;
>  		page = pmd_page(*pmd);
>  		if (is_huge_zero_page(page)) {
> @@ -407,7 +407,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>  			split_huge_pmd(vma, pmd, address);
>  			if (pmd_trans_unstable(pmd))
>  				ret = -EBUSY;
> -		} else {
> +		} else if (flags & FOLL_SPLIT) {
>  			if (unlikely(!try_get_page(page))) {
>  				spin_unlock(ptl);
>  				return ERR_PTR(-ENOMEM);
> @@ -419,8 +419,40 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>  			put_page(page);
>  			if (pmd_none(*pmd))
>  				return no_page_table(vma, flags);
> -		}
> +		} else {  /* flags & FOLL_SPLIT_PMD */
> +			unsigned long addr;
> +			pgprot_t prot;
> +			pte_t *pte;
> +			int i;
> +
> +			spin_unlock(ptl);
> +			split_huge_pmd(vma, pmd, address);

All the code below is only relevant for file-backed THP. It will break for
anon-THP.

And I'm not convinced that it belongs here at all. User requested PMD
split and it is done after split_huge_pmd(). The rest can be handled by
the caller as needed.
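
In code terms, __split_huge_pmd_locked() treats the two cases very
differently; an abbreviated, v5.2-era sketch from mm/huge_memory.c
(elided for brevity):

	if (!vma_is_anonymous(vma)) {
		/* file-backed: clear the PMD and return;
		 * no PTEs are left behind */
		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
		...
		return;
	}
	/*
	 * anonymous: the deposited page table is withdrawn and the
	 * HPAGE_PMD_NR PTEs are installed right here, so populating
	 * them again in GUP would clash with this.
	 */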

> +			lock_page(page);
> +			pte = get_locked_pte(mm, address, &ptl);
> +			if (!pte) {
> +				unlock_page(page);
> +				return no_page_table(vma, flags);

Or should it be -ENOMEM?

> +			}
>  
> +			/* get refcount for every small page */
> +			page_ref_add(page, HPAGE_PMD_NR);
> +
> +			prot = READ_ONCE(vma->vm_page_prot);
> +			for (i = 0, addr = address & PMD_MASK;
> +			     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> +				struct page *p = page + i;
> +
> +				pte = pte_offset_map(pmd, addr);
> +				VM_BUG_ON(!pte_none(*pte));
> +				set_pte_at(mm, addr, pte, mk_pte(p, prot));
> +				page_add_file_rmap(p, false);
> +			}
> +
> +			spin_unlock(ptl);
> +			unlock_page(page);
> +			add_mm_counter(mm, mm_counter_file(page), HPAGE_PMD_NR);
> +			ret = 0;
> +		}
>  		return ret ? ERR_PTR(ret) :
>  			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>  	}
> -- 
> 2.17.1
>
Song Liu June 13, 2019, 1:57 p.m. UTC | #2
> On Jun 13, 2019, at 5:57 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> 
> On Wed, Jun 12, 2019 at 03:03:17PM -0700, Song Liu wrote:
>> This patch introduces a new foll_flag: FOLL_SPLIT_PMD. As the name
>> suggests, FOLL_SPLIT_PMD splits the huge pmd for the given mm_struct;
>> the underlying huge page stays as-is.
>> 
>> FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
>> but would like to switch back to the huge page and huge pmd later on.
>> One such example is uprobe. The following patches use FOLL_SPLIT_PMD
>> in uprobe.
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>> include/linux/mm.h |  1 +
>> mm/gup.c           | 38 +++++++++++++++++++++++++++++++++++---
>> 2 files changed, 36 insertions(+), 3 deletions(-)
>> 
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 0ab8c7d84cd0..e605acc4fc81 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -2642,6 +2642,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>> #define FOLL_COW	0x4000	/* internal GUP flag */
>> #define FOLL_ANON	0x8000	/* don't do file mappings */
>> #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
>> +#define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
>> 
>> /*
>>  * NOTE on FOLL_LONGTERM:
>> diff --git a/mm/gup.c b/mm/gup.c
>> index ddde097cf9e4..3d05bddb56c9 100644
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -398,7 +398,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> 		spin_unlock(ptl);
>> 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>> 	}
>> -	if (flags & FOLL_SPLIT) {
>> +	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>> 		int ret;
>> 		page = pmd_page(*pmd);
>> 		if (is_huge_zero_page(page)) {
>> @@ -407,7 +407,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> 			split_huge_pmd(vma, pmd, address);
>> 			if (pmd_trans_unstable(pmd))
>> 				ret = -EBUSY;
>> -		} else {
>> +		} else if (flags & FOLL_SPLIT) {
>> 			if (unlikely(!try_get_page(page))) {
>> 				spin_unlock(ptl);
>> 				return ERR_PTR(-ENOMEM);
>> @@ -419,8 +419,40 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> 			put_page(page);
>> 			if (pmd_none(*pmd))
>> 				return no_page_table(vma, flags);
>> -		}
>> +		} else {  /* flags & FOLL_SPLIT_PMD */
>> +			unsigned long addr;
>> +			pgprot_t prot;
>> +			pte_t *pte;
>> +			int i;
>> +
>> +			spin_unlock(ptl);
>> +			split_huge_pmd(vma, pmd, address);
> 
> All the code below is only relevant for file-backed THP. It will break for
> anon-THP.

Oh, yes, that makes sense. 

> 
> And I'm not convinced that it belongs here at all. User requested PMD
> split and it is done after split_huge_pmd(). The rest can be handled by
> the caller as needed.

I put this part here because, for file-backed THP, the split is not
really complete after split_huge_pmd() returns: the PTEs are left
unpopulated. And I would like that done before calling
follow_page_pte() below. Maybe we can still do it here, just for
file-backed THPs?

If we were to move it, shall we move it to the callers of
follow_page_mask()? In that case, we will probably end up with
similar code in two places: __get_user_pages() and follow_page().

Did I get this right?

> 
>> +			lock_page(page);
>> +			pte = get_locked_pte(mm, address, &ptl);
>> +			if (!pte) {
>> +				unlock_page(page);
>> +				return no_page_table(vma, flags);
> 
> Or should it be -ENOMEM?

Yeah, ENOMEM is more accurate. 

Thanks,
Song

> 
>> +			}
>> 
>> +			/* get refcount for every small page */
>> +			page_ref_add(page, HPAGE_PMD_NR);
>> +
>> +			prot = READ_ONCE(vma->vm_page_prot);
>> +			for (i = 0, addr = address & PMD_MASK;
>> +			     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>> +				struct page *p = page + i;
>> +
>> +				pte = pte_offset_map(pmd, addr);
>> +				VM_BUG_ON(!pte_none(*pte));
>> +				set_pte_at(mm, addr, pte, mk_pte(p, prot));
>> +				page_add_file_rmap(p, false);
>> +			}
>> +
>> +			spin_unlock(ptl);
>> +			unlock_page(page);
>> +			add_mm_counter(mm, mm_counter_file(page), HPAGE_PMD_NR);
>> +			ret = 0;
>> +		}
>> 		return ret ? ERR_PTR(ret) :
>> 			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>> 	}
>> -- 
>> 2.17.1
>> 
> 
> -- 
> Kirill A. Shutemov
Kirill A. Shutemov June 13, 2019, 2:16 p.m. UTC | #3
On Thu, Jun 13, 2019 at 01:57:30PM +0000, Song Liu wrote:
> > And I'm not convinced that it belongs here at all. User requested PMD
> > split and it is done after split_huge_pmd(). The rest can be handled by
> > the caller as needed.
> 
> I put this part here because, for file-backed THP, the split is not
> really complete after split_huge_pmd() returns: the PTEs are left
> unpopulated. And I would like that done before calling
> follow_page_pte() below. Maybe we can still do it here, just for
> file-backed THPs?
> 
> If we were to move it, shall we move it to the callers of
> follow_page_mask()? In that case, we will probably end up with
> similar code in two places: __get_user_pages() and follow_page().
> 
> Did I get this right?

Would it be enough to replace pte_offset_map_lock() in follow_page_pte()
with pte_alloc_map_lock()?

This will leave a bunch of unpopulated PTE entries, but that is fine:
they will be populated on the next access to them.
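
For reference, pte_alloc_map_lock() differs from pte_offset_map_lock()
only in that it first allocates the PTE page table when the PMD entry
is empty; roughly, from include/linux/mm.h of this era:

	#define pte_alloc(mm, pmd) \
		(unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

	#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
		(pte_alloc(mm, pmd) ?				\
			NULL : pte_offset_map_lock(mm, pmd, address, ptlp))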
Song Liu June 13, 2019, 3:03 p.m. UTC | #4
> On Jun 13, 2019, at 7:16 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> 
> On Thu, Jun 13, 2019 at 01:57:30PM +0000, Song Liu wrote:
>>> And I'm not convinced that it belongs here at all. User requested PMD
>>> split and it is done after split_huge_pmd(). The rest can be handled by
>>> the caller as needed.
>> 
>> I put this part here because, for file-backed THP, the split is not
>> really complete after split_huge_pmd() returns: the PTEs are left
>> unpopulated. And I would like that done before calling
>> follow_page_pte() below. Maybe we can still do it here, just for
>> file-backed THPs?
>> 
>> If we were to move it, shall we move it to the callers of
>> follow_page_mask()? In that case, we will probably end up with
>> similar code in two places: __get_user_pages() and follow_page().
>> 
>> Did I get this right?
> 
> Would it be enough to replace pte_offset_map_lock() in follow_page_pte()
> with pte_alloc_map_lock()?

This is similar to my previous version:

+		} else {  /* flags & FOLL_SPLIT_PMD */
+			pte_t *pte;
+			spin_unlock(ptl);
+			split_huge_pmd(vma, pmd, address);
+			pte = get_locked_pte(mm, address, &ptl);
+			if (!pte)
+				return no_page_table(vma, flags);
+			spin_unlock(ptl);
+			ret = 0;
+		}

I think this is cleaner than using pte_alloc_map_lock() in follow_page_pte().
What's your thought on these two versions (^^^ vs. pte_alloc_map_lock)?


> This will leave a bunch of unpopulated PTE entries, but that is fine:
> they will be populated on the next access to them.

We need to handle a page fault on the next access, right? Since we
have already allocated everything, we can just populate the PTE
entries and save a lot of page faults (assuming we will access them
later).

Thanks,
Song

> 
> -- 
> Kirill A. Shutemov
kirill.shutemov@linux.intel.com June 13, 2019, 3:14 p.m. UTC | #5
On Thu, Jun 13, 2019 at 03:03:01PM +0000, Song Liu wrote:
> 
> 
> > On Jun 13, 2019, at 7:16 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > 
> > On Thu, Jun 13, 2019 at 01:57:30PM +0000, Song Liu wrote:
> >>> And I'm not convinced that it belongs here at all. User requested PMD
> >>> split and it is done after split_huge_pmd(). The rest can be handled by
> >>> the caller as needed.
> >> 
> >> I put this part here because, for file-backed THP, the split is not
> >> really complete after split_huge_pmd() returns: the PTEs are left
> >> unpopulated. And I would like that done before calling
> >> follow_page_pte() below. Maybe we can still do it here, just for
> >> file-backed THPs?
> >> 
> >> If we were to move it, shall we move it to the callers of
> >> follow_page_mask()? In that case, we will probably end up with
> >> similar code in two places: __get_user_pages() and follow_page().
> >> 
> >> Did I get this right?
> > 
> > Would it be enough to replace pte_offset_map_lock() in follow_page_pte()
> > with pte_alloc_map_lock()?
> 
> This is similar to my previous version:
> 
> +		} else {  /* flags & FOLL_SPLIT_PMD */
> +			pte_t *pte;
> +			spin_unlock(ptl);
> +			split_huge_pmd(vma, pmd, address);
> +			pte = get_locked_pte(mm, address, &ptl);
> +			if (!pte)
> +				return no_page_table(vma, flags);
> +			spin_unlock(ptl);
> +			ret = 0;
> +		}
> 
> I think this is cleaner than using pte_alloc_map_lock() in follow_page_pte().
> What's your thought on these two versions (^^^ vs. pte_alloc_map_lock)?

It's an additional lock-unlock cycle and a few more lines of code...

> > This will leave a bunch of unpopulated PTE entries, but that is fine:
> > they will be populated on the next access to them.
> 
> We need to handle a page fault on the next access, right? Since we
> have already allocated everything, we can just populate the PTE
> entries and save a lot of page faults (assuming we will access them
> later).

Not a lot, due to faultaround, and they may never happen; but you need
to tear down the mapping anyway.
Song Liu June 13, 2019, 3:24 p.m. UTC | #6
> On Jun 13, 2019, at 8:14 AM, Kirill A. Shutemov <kirill.shutemov@linux.intel.com> wrote:
> 
> On Thu, Jun 13, 2019 at 03:03:01PM +0000, Song Liu wrote:
>> 
>> 
>>> On Jun 13, 2019, at 7:16 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>>> 
>>> On Thu, Jun 13, 2019 at 01:57:30PM +0000, Song Liu wrote:
>>>>> And I'm not convinced that it belongs here at all. User requested PMD
>>>>> split and it is done after split_huge_pmd(). The rest can be handled by
>>>>> the caller as needed.
>>>> 
>>>> I put this part here because, for file-backed THP, the split is not
>>>> really complete after split_huge_pmd() returns: the PTEs are left
>>>> unpopulated. And I would like that done before calling
>>>> follow_page_pte() below. Maybe we can still do it here, just for
>>>> file-backed THPs?
>>>> 
>>>> If we were to move it, shall we move it to the callers of
>>>> follow_page_mask()? In that case, we will probably end up with
>>>> similar code in two places: __get_user_pages() and follow_page().
>>>> 
>>>> Did I get this right?
>>> 
>>> Would it be enough to replace pte_offset_map_lock() in follow_page_pte()
>>> with pte_alloc_map_lock()?
>> 
>> This is similar to my previous version:
>> 
>> +		} else {  /* flags & FOLL_SPLIT_PMD */
>> +			pte_t *pte;
>> +			spin_unlock(ptl);
>> +			split_huge_pmd(vma, pmd, address);
>> +			pte = get_locked_pte(mm, address, &ptl);
>> +			if (!pte)
>> +				return no_page_table(vma, flags);
>> +			spin_unlock(ptl);
>> +			ret = 0;
>> +		}
>> 
>> I think this is cleaner than using pte_alloc_map_lock() in follow_page_pte().
>> What's your thought on these two versions (^^^ vs. pte_alloc_map_lock)?
> 
> It's an additional lock-unlock cycle and a few more lines of code...
> 
>>> This will leave a bunch of unpopulated PTE entries, but that is fine:
>>> they will be populated on the next access to them.
>> 
>> We need to handle a page fault on the next access, right? Since we
>> have already allocated everything, we can just populate the PTE
>> entries and save a lot of page faults (assuming we will access them
>> later).
> 
> Not a lot, due to faultaround, and they may never happen; but you need
> to tear down the mapping anyway.

I see. Let me try this way. 

Thanks,
Song

> 
> -- 
> Kirill A. Shutemov
Song Liu June 13, 2019, 4:47 p.m. UTC | #7
Hi Kirill,

> On Jun 13, 2019, at 8:24 AM, Song Liu <songliubraving@fb.com> wrote:
> 
> 
> 
>> On Jun 13, 2019, at 8:14 AM, Kirill A. Shutemov <kirill.shutemov@linux.intel.com> wrote:
>> 
>> On Thu, Jun 13, 2019 at 03:03:01PM +0000, Song Liu wrote:
>>> 
>>> 
>>>> On Jun 13, 2019, at 7:16 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>>>> 
>>>> On Thu, Jun 13, 2019 at 01:57:30PM +0000, Song Liu wrote:
>>>>>> And I'm not convinced that it belongs here at all. User requested PMD
>>>>>> split and it is done after split_huge_pmd(). The rest can be handled by
>>>>>> the caller as needed.
>>>>> 
>>>>> I put this part here because, for file-backed THP, the split is not
>>>>> really complete after split_huge_pmd() returns: the PTEs are left
>>>>> unpopulated. And I would like that done before calling
>>>>> follow_page_pte() below. Maybe we can still do it here, just for
>>>>> file-backed THPs?
>>>>> 
>>>>> If we were to move it, shall we move it to the callers of
>>>>> follow_page_mask()? In that case, we will probably end up with
>>>>> similar code in two places: __get_user_pages() and follow_page().
>>>>> 
>>>>> Did I get this right?
>>>> 
>>>> Would it be enough to replace pte_offset_map_lock() in follow_page_pte()
>>>> with pte_alloc_map_lock()?
>>> 
>>> This is similar to my previous version:
>>> 
>>> +		} else {  /* flags & FOLL_SPLIT_PMD */
>>> +			pte_t *pte;
>>> +			spin_unlock(ptl);
>>> +			split_huge_pmd(vma, pmd, address);
>>> +			pte = get_locked_pte(mm, address, &ptl);
>>> +			if (!pte)
>>> +				return no_page_table(vma, flags);
>>> +			spin_unlock(ptl);
>>> +			ret = 0;
>>> +		}
>>> 
>>> I think this is cleaner than using pte_alloc_map_lock() in follow_page_pte().
>>> What's your thought on these two versions (^^^ vs. pte_alloc_map_lock)?
>> 
>> It's an additional lock-unlock cycle and a few more lines of code...
>> 
>>>> This will leave a bunch of unpopulated PTE entries, but that is fine:
>>>> they will be populated on the next access to them.
>>> 
>>> We need to handle a page fault on the next access, right? Since we
>>> have already allocated everything, we can just populate the PTE
>>> entries and save a lot of page faults (assuming we will access them
>>> later).
>> 
>> Not a lot, due to faultaround, and they may never happen; but you need
>> to tear down the mapping anyway.
> 
> I see. Let me try this way. 
> 
> Thanks,
> Song

To make sure I understand your suggestions, here is what I got:

diff --git c/mm/gup.c w/mm/gup.c
index ddde097cf9e4..85e6f46fd925 100644
--- c/mm/gup.c
+++ w/mm/gup.c
@@ -197,7 +197,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
        if (unlikely(pmd_bad(*pmd)))
                return no_page_table(vma, flags);

-       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       ptep = pte_alloc_map_lock(mm, pmd, address, &ptl);
+       if (!ptep)
+               return ERR_PTR(-ENOMEM);
+
        pte = *ptep;
        if (!pte_present(pte)) {
                swp_entry_t entry;
@@ -398,7 +401,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                spin_unlock(ptl);
                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
-       if (flags & FOLL_SPLIT) {
+       if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
                int ret;
                page = pmd_page(*pmd);
                if (is_huge_zero_page(page)) {
@@ -407,7 +410,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                        split_huge_pmd(vma, pmd, address);
                        if (pmd_trans_unstable(pmd))
                                ret = -EBUSY;
-               } else {
+               } else if (flags & FOLL_SPLIT) {
                        if (unlikely(!try_get_page(page))) {
                                spin_unlock(ptl);
                                return ERR_PTR(-ENOMEM);
@@ -419,6 +422,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                        put_page(page);
                        if (pmd_none(*pmd))
                                return no_page_table(vma, flags);
+               } else {  /* flags & FOLL_SPLIT_PMD */
+                       spin_unlock(ptl);
+                       split_huge_pmd(vma, pmd, address);
+                       ret = 0;
                }

                return ret ? ERR_PTR(ret) :
                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);


This version doesn't work as-is: for a file-backed THP,
split_huge_pmd() leaves the PMD cleared, and a cleared PMD already
fails the first check below, before pte_alloc_map_lock() is ever
reached:

        if (unlikely(pmd_bad(*pmd)))
                return no_page_table(vma, flags);

Did I misunderstand anything here?

Thanks,
Song


> 
>> 
>> -- 
>> Kirill A. Shutemov
Song Liu June 13, 2019, 5:42 p.m. UTC | #8
> On Jun 13, 2019, at 9:47 AM, Song Liu <songliubraving@fb.com> wrote:
> 
> Hi Kirill,
> 
>> On Jun 13, 2019, at 8:24 AM, Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Jun 13, 2019, at 8:14 AM, Kirill A. Shutemov <kirill.shutemov@linux.intel.com> wrote:
>>> 
>>> On Thu, Jun 13, 2019 at 03:03:01PM +0000, Song Liu wrote:
>>>> 
>>>> 
>>>>> On Jun 13, 2019, at 7:16 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
>>>>> 
>>>>> On Thu, Jun 13, 2019 at 01:57:30PM +0000, Song Liu wrote:
>>>>>>> And I'm not convinced that it belongs here at all. User requested PMD
>>>>>>> split and it is done after split_huge_pmd(). The rest can be handled by
>>>>>>> the caller as needed.
>>>>>> 
>>>>>> I put this part here because, for file-backed THP, the split is not
>>>>>> really complete after split_huge_pmd() returns: the PTEs are left
>>>>>> unpopulated. And I would like that done before calling
>>>>>> follow_page_pte() below. Maybe we can still do it here, just for
>>>>>> file-backed THPs?
>>>>>> 
>>>>>> If we were to move it, shall we move it to the callers of
>>>>>> follow_page_mask()? In that case, we will probably end up with
>>>>>> similar code in two places: __get_user_pages() and follow_page().
>>>>>> 
>>>>>> Did I get this right?
>>>>> 
>>>>> Would it be enough to replace pte_offset_map_lock() in follow_page_pte()
>>>>> with pte_alloc_map_lock()?
>>>> 
>>>> This is similar to my previous version:
>>>> 
>>>> +		} else {  /* flags & FOLL_SPLIT_PMD */
>>>> +			pte_t *pte;
>>>> +			spin_unlock(ptl);
>>>> +			split_huge_pmd(vma, pmd, address);
>>>> +			pte = get_locked_pte(mm, address, &ptl);
>>>> +			if (!pte)
>>>> +				return no_page_table(vma, flags);
>>>> +			spin_unlock(ptl);
>>>> +			ret = 0;
>>>> +		}
>>>> 
>>>> I think this is cleaner than using pte_alloc_map_lock() in follow_page_pte().
>>>> What's your thought on these two versions (^^^ vs. pte_alloc_map_lock)?
>>> 
>>> It's an additional lock-unlock cycle and a few more lines of code...
>>> 
>>>>> This will leave a bunch of unpopulated PTE entries, but that is fine:
>>>>> they will be populated on the next access to them.
>>>> 
>>>> We need to handle a page fault on the next access, right? Since we
>>>> have already allocated everything, we can just populate the PTE
>>>> entries and save a lot of page faults (assuming we will access them
>>>> later).
>>> 
>>> Not a lot, due to faultaround, and they may never happen; but you need
>>> to tear down the mapping anyway.
>> 
>> I see. Let me try this way. 
>> 
>> Thanks,
>> Song
> 
> To make sure I understand your suggestions, here is what I got:
> 
> diff --git c/mm/gup.c w/mm/gup.c
> index ddde097cf9e4..85e6f46fd925 100644
> --- c/mm/gup.c
> +++ w/mm/gup.c
> @@ -197,7 +197,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>        if (unlikely(pmd_bad(*pmd)))
>                return no_page_table(vma, flags);
> 
> -       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
> +       ptep = pte_alloc_map_lock(mm, pmd, address, &ptl);
> +       if (!ptep)
> +               return ERR_PTR(-ENOMEM);
> +
>        pte = *ptep;
>        if (!pte_present(pte)) {
>                swp_entry_t entry;
> @@ -398,7 +401,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>                spin_unlock(ptl);
>                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>        }
> -       if (flags & FOLL_SPLIT) {
> +       if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>                int ret;
>                page = pmd_page(*pmd);
>                if (is_huge_zero_page(page)) {
> @@ -407,7 +410,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>                        split_huge_pmd(vma, pmd, address);
>                        if (pmd_trans_unstable(pmd))
>                                ret = -EBUSY;
> -               } else {
> +               } else if (flags & FOLL_SPLIT) {
>                        if (unlikely(!try_get_page(page))) {
>                                spin_unlock(ptl);
>                                return ERR_PTR(-ENOMEM);
> @@ -419,6 +422,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>                        put_page(page);
>                        if (pmd_none(*pmd))
>                                return no_page_table(vma, flags);
> +               } else {  /* flags & FOLL_SPLIT_PMD */
> +                       spin_unlock(ptl);
> +                       split_huge_pmd(vma, pmd, address);
> +                       ret = 0;
>                }
> 
>                return ret ? ERR_PTR(ret) :
>                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
> 
> 
> This version doesn't work as-is: for a file-backed THP,
> split_huge_pmd() leaves the PMD cleared, and a cleared PMD already
> fails the first check below, before pte_alloc_map_lock() is ever
> reached:
> 
>        if (unlikely(pmd_bad(*pmd)))
>                return no_page_table(vma, flags);
> 
> Did I misunderstand anything here?
> 
> Thanks,
> Song

I guess this would be the best. It _is_ a lot simpler. 

diff --git c/mm/gup.c w/mm/gup.c
index ddde097cf9e4..0cd3ce599f41 100644
--- c/mm/gup.c
+++ w/mm/gup.c
@@ -398,7 +398,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                spin_unlock(ptl);
                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
-       if (flags & FOLL_SPLIT) {
+       if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
                int ret;
                page = pmd_page(*pmd);
                if (is_huge_zero_page(page)) {
@@ -407,7 +407,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                        split_huge_pmd(vma, pmd, address);
                        if (pmd_trans_unstable(pmd))
                                ret = -EBUSY;
-               } else {
+               } else if (flags & FOLL_SPLIT) {
                        if (unlikely(!try_get_page(page))) {
                                spin_unlock(ptl);
                                return ERR_PTR(-ENOMEM);
@@ -419,6 +419,11 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                        put_page(page);
                        if (pmd_none(*pmd))
                                return no_page_table(vma, flags);
+               } else {  /* flags & FOLL_SPLIT_PMD */
+                       spin_unlock(ptl);
+                       ret = 0;
+                       split_huge_pmd(vma, pmd, address);
+                       pte_alloc(mm, pmd);
                }
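
(Why this works where the pte_alloc_map_lock() attempt did not:
split_huge_pmd() leaves a file-backed PMD empty, which is what tripped
the pmd_bad() check in follow_page_pte(); the explicit pte_alloc()
installs an empty PTE table first, so that check now passes and the
PTEs are filled in later by normal faults, as discussed above.)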

Thanks again for the suggestions. I will send v4 soon. 

Song


> 
> 
>> 
>>> 
>>> -- 
>>> Kirill A. Shutemov

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ab8c7d84cd0..e605acc4fc81 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2642,6 +2642,7 @@  struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
 #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
 
 /*
  * NOTE on FOLL_LONGTERM:
diff --git a/mm/gup.c b/mm/gup.c
index ddde097cf9e4..3d05bddb56c9 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -398,7 +398,7 @@  static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}
-	if (flags & FOLL_SPLIT) {
+	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
 		int ret;
 		page = pmd_page(*pmd);
 		if (is_huge_zero_page(page)) {
@@ -407,7 +407,7 @@  static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			split_huge_pmd(vma, pmd, address);
 			if (pmd_trans_unstable(pmd))
 				ret = -EBUSY;
-		} else {
+		} else if (flags & FOLL_SPLIT) {
 			if (unlikely(!try_get_page(page))) {
 				spin_unlock(ptl);
 				return ERR_PTR(-ENOMEM);
@@ -419,8 +419,40 @@  static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			put_page(page);
 			if (pmd_none(*pmd))
 				return no_page_table(vma, flags);
-		}
+		} else {  /* flags & FOLL_SPLIT_PMD */
+			unsigned long addr;
+			pgprot_t prot;
+			pte_t *pte;
+			int i;
+
+			spin_unlock(ptl);
+			split_huge_pmd(vma, pmd, address);
+			lock_page(page);
+			pte = get_locked_pte(mm, address, &ptl);
+			if (!pte) {
+				unlock_page(page);
+				return no_page_table(vma, flags);
+			}
 
+			/* get refcount for every small page */
+			page_ref_add(page, HPAGE_PMD_NR);
+
+			prot = READ_ONCE(vma->vm_page_prot);
+			for (i = 0, addr = address & PMD_MASK;
+			     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+				struct page *p = page + i;
+
+				pte = pte_offset_map(pmd, addr);
+				VM_BUG_ON(!pte_none(*pte));
+				set_pte_at(mm, addr, pte, mk_pte(p, prot));
+				page_add_file_rmap(p, false);
+			}
+
+			spin_unlock(ptl);
+			unlock_page(page);
+			add_mm_counter(mm, mm_counter_file(page), HPAGE_PMD_NR);
+			ret = 0;
+		}
 		return ret ? ERR_PTR(ret) :
 			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}