diff mbox series

[v12,3/6] mm, thp: introduce FOLL_SPLIT_PMD

Message ID 20190807233729.3899352-4-songliubraving@fb.com (mailing list archive)
State New, archived
Headers show
Series THP aware uprobe | expand

Commit Message

Song Liu Aug. 7, 2019, 11:37 p.m. UTC
This patches introduces a new foll_flag: FOLL_SPLIT_PMD. As the name says
FOLL_SPLIT_PMD splits huge pmd for given mm_struct, the underlining huge
page stays as-is.

FOLL_SPLIT_PMD is useful for cases where we need to use regular pages,
but would switch back to huge page and huge pmd on. One of such example
is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe.

Cc: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/mm.h | 1 +
 mm/gup.c           | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

Comments

Oleg Nesterov Aug. 8, 2019, 4:37 p.m. UTC | #1
On 08/07, Song Liu wrote:
>
> @@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>  		spin_unlock(ptl);
>  		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>  	}
> -	if (flags & FOLL_SPLIT) {
> +	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>  		int ret;
>  		page = pmd_page(*pmd);
>  		if (is_huge_zero_page(page)) {
> @@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>  			split_huge_pmd(vma, pmd, address);
>  			if (pmd_trans_unstable(pmd))
>  				ret = -EBUSY;
> -		} else {
> +		} else if (flags & FOLL_SPLIT) {
>  			if (unlikely(!try_get_page(page))) {
>  				spin_unlock(ptl);
>  				return ERR_PTR(-ENOMEM);
> @@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>  			put_page(page);
>  			if (pmd_none(*pmd))
>  				return no_page_table(vma, flags);
> +		} else {  /* flags & FOLL_SPLIT_PMD */
> +			spin_unlock(ptl);
> +			split_huge_pmd(vma, pmd, address);
> +			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
>  		}

Can't resist, let me repeat that I do not like this patch because imo
it complicates this code for no reason.

But I can't insist and of course I could miss something.

Oleg.
Song Liu Aug. 8, 2019, 5:16 p.m. UTC | #2
> On Aug 8, 2019, at 9:37 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> 
> On 08/07, Song Liu wrote:
>> 
>> @@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> 		spin_unlock(ptl);
>> 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>> 	}
>> -	if (flags & FOLL_SPLIT) {
>> +	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>> 		int ret;
>> 		page = pmd_page(*pmd);
>> 		if (is_huge_zero_page(page)) {
>> @@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> 			split_huge_pmd(vma, pmd, address);
>> 			if (pmd_trans_unstable(pmd))
>> 				ret = -EBUSY;
>> -		} else {
>> +		} else if (flags & FOLL_SPLIT) {
>> 			if (unlikely(!try_get_page(page))) {
>> 				spin_unlock(ptl);
>> 				return ERR_PTR(-ENOMEM);
>> @@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>> 			put_page(page);
>> 			if (pmd_none(*pmd))
>> 				return no_page_table(vma, flags);
>> +		} else {  /* flags & FOLL_SPLIT_PMD */
>> +			spin_unlock(ptl);
>> +			split_huge_pmd(vma, pmd, address);
>> +			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
>> 		}
> 
> Can't resist, let me repeat that I do not like this patch because imo
> it complicates this code for no reason.

Personally, I don't think this is more complicated than your version. 
This patch is safe as it doesn't change any code for is_huge_zero_page() 
case. 

Also, if some code calls follow_pmd_mask() with flags contains both 
FOLL_SPLIT and FOLL_SPLIT_PMD, we should honor FOLL_SPLIT and split the
huge page. Of course, there is no code that sets both flags.

Does this resolve your concern here?

Thanks,
Song
Oleg Nesterov Aug. 9, 2019, 4:35 p.m. UTC | #3
On 08/08, Song Liu wrote:
>
> > On Aug 8, 2019, at 9:37 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > On 08/07, Song Liu wrote:
> >>
> >> @@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> >> 		spin_unlock(ptl);
> >> 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
> >> 	}
> >> -	if (flags & FOLL_SPLIT) {
> >> +	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
> >> 		int ret;
> >> 		page = pmd_page(*pmd);
> >> 		if (is_huge_zero_page(page)) {
> >> @@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> >> 			split_huge_pmd(vma, pmd, address);
> >> 			if (pmd_trans_unstable(pmd))
> >> 				ret = -EBUSY;
> >> -		} else {
> >> +		} else if (flags & FOLL_SPLIT) {
> >> 			if (unlikely(!try_get_page(page))) {
> >> 				spin_unlock(ptl);
> >> 				return ERR_PTR(-ENOMEM);
> >> @@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
> >> 			put_page(page);
> >> 			if (pmd_none(*pmd))
> >> 				return no_page_table(vma, flags);
> >> +		} else {  /* flags & FOLL_SPLIT_PMD */
> >> +			spin_unlock(ptl);
> >> +			split_huge_pmd(vma, pmd, address);
> >> +			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
> >> 		}
> >
> > Can't resist, let me repeat that I do not like this patch because imo
> > it complicates this code for no reason.
>
> Personally, I don't think this is more complicated than your version.

I do, but of course this is subjective.

> Also, if some code calls follow_pmd_mask() with flags contains both
> FOLL_SPLIT and FOLL_SPLIT_PMD, we should honor FOLL_SPLIT and split the
> huge page.

Heh. why not other way around?

> Of course, there is no code that sets both flags.

and of course, nobody should ever pass both FOLL_SPLIT and FOLL_SPLIT_PMD,
perhaps this deserves a warning.

Not to mention that it would be nice to kill FOLL_SPLIT which has a single
user, but this is another story.

Oleg.
Song Liu Aug. 9, 2019, 4:50 p.m. UTC | #4
> On Aug 9, 2019, at 9:35 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> 
> On 08/08, Song Liu wrote:
>> 
>>> On Aug 8, 2019, at 9:37 AM, Oleg Nesterov <oleg@redhat.com> wrote:
>>> 
>>> On 08/07, Song Liu wrote:
>>>> 
>>>> @@ -399,7 +399,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>>>> 		spin_unlock(ptl);
>>>> 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
>>>> 	}
>>>> -	if (flags & FOLL_SPLIT) {
>>>> +	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>>>> 		int ret;
>>>> 		page = pmd_page(*pmd);
>>>> 		if (is_huge_zero_page(page)) {
>>>> @@ -408,7 +408,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>>>> 			split_huge_pmd(vma, pmd, address);
>>>> 			if (pmd_trans_unstable(pmd))
>>>> 				ret = -EBUSY;
>>>> -		} else {
>>>> +		} else if (flags & FOLL_SPLIT) {
>>>> 			if (unlikely(!try_get_page(page))) {
>>>> 				spin_unlock(ptl);
>>>> 				return ERR_PTR(-ENOMEM);
>>>> @@ -420,6 +420,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>>>> 			put_page(page);
>>>> 			if (pmd_none(*pmd))
>>>> 				return no_page_table(vma, flags);
>>>> +		} else {  /* flags & FOLL_SPLIT_PMD */
>>>> +			spin_unlock(ptl);
>>>> +			split_huge_pmd(vma, pmd, address);
>>>> +			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
>>>> 		}
>>> 
>>> Can't resist, let me repeat that I do not like this patch because imo
>>> it complicates this code for no reason.
>> 
>> Personally, I don't think this is more complicated than your version.
> 
> I do, but of course this is subjective.
> 
>> Also, if some code calls follow_pmd_mask() with flags contains both
>> FOLL_SPLIT and FOLL_SPLIT_PMD, we should honor FOLL_SPLIT and split the
>> huge page.
> 
> Heh. why not other way around?

Because FOLL_SPLIT splits both the page and the pmd. FOLL_SPLIT_PMD 
only splits the pmd, so it is a subset of FOLL_SPLIT. When the user
sets both, we should split both the page and the pmd. 

Thanks,
Song
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f189176dabed..74db879711eb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2614,6 +2614,7 @@  struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
 #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
 
 /*
  * NOTE on FOLL_LONGTERM:
diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..c20afe800b3f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -399,7 +399,7 @@  static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}
-	if (flags & FOLL_SPLIT) {
+	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
 		int ret;
 		page = pmd_page(*pmd);
 		if (is_huge_zero_page(page)) {
@@ -408,7 +408,7 @@  static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			split_huge_pmd(vma, pmd, address);
 			if (pmd_trans_unstable(pmd))
 				ret = -EBUSY;
-		} else {
+		} else if (flags & FOLL_SPLIT) {
 			if (unlikely(!try_get_page(page))) {
 				spin_unlock(ptl);
 				return ERR_PTR(-ENOMEM);
@@ -420,6 +420,10 @@  static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			put_page(page);
 			if (pmd_none(*pmd))
 				return no_page_table(vma, flags);
+		} else {  /* flags & FOLL_SPLIT_PMD */
+			spin_unlock(ptl);
+			split_huge_pmd(vma, pmd, address);
+			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
 		}
 
 		return ret ? ERR_PTR(ret) :