
[RFC,v2,2/2] mm: mmap: zap pages with read mmap_sem for large mapping

Message ID 1529364856-49589-3-git-send-email-yang.shi@linux.alibaba.com (mailing list archive)
State New, archived

Commit Message

Yang Shi June 18, 2018, 11:34 p.m. UTC
When running some mmap/munmap scalability tests with large memory (i.e.
> 300GB), the below hung task issue may happen occasionally.

INFO: task ps:14018 blocked for more than 120 seconds.
       Tainted: G            E 4.9.79-009.ali3000.alios7.x86_64 #1
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
message.
 ps              D    0 14018      1 0x00000004
  ffff885582f84000 ffff885e8682f000 ffff880972943000 ffff885ebf499bc0
  ffff8828ee120000 ffffc900349bfca8 ffffffff817154d0 0000000000000040
  00ffffff812f872a ffff885ebf499bc0 024000d000948300 ffff880972943000
 Call Trace:
  [<ffffffff817154d0>] ? __schedule+0x250/0x730
  [<ffffffff817159e6>] schedule+0x36/0x80
  [<ffffffff81718560>] rwsem_down_read_failed+0xf0/0x150
  [<ffffffff81390a28>] call_rwsem_down_read_failed+0x18/0x30
  [<ffffffff81717db0>] down_read+0x20/0x40
  [<ffffffff812b9439>] proc_pid_cmdline_read+0xd9/0x4e0
  [<ffffffff81253c95>] ? do_filp_open+0xa5/0x100
  [<ffffffff81241d87>] __vfs_read+0x37/0x150
  [<ffffffff812f824b>] ? security_file_permission+0x9b/0xc0
  [<ffffffff81242266>] vfs_read+0x96/0x130
  [<ffffffff812437b5>] SyS_read+0x55/0xc0
  [<ffffffff8171a6da>] entry_SYSCALL_64_fastpath+0x1a/0xc5

It is because munmap() holds mmap_sem from the very beginning all the way
to the end and doesn't release it in the middle. Unmapping a large mapping
may take a long time (~18 seconds to unmap a 320GB mapping with every
single page mapped, on an otherwise idle machine).
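
The following minimal userspace sketch illustrates the kind of munmap()
that triggers the long mmap_sem hold (the 320GB size and the competing
/proc/<pid>/cmdline reader such as ps are illustrative, not part of this
patch):

#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
	/* Map a huge anonymous region */
	size_t len = 320UL << 30;	/* 320GB */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Fault in every page so munmap() has real zapping work to do */
	for (size_t off = 0; off < len; off += 4096)
		p[off] = 1;

	/*
	 * Today this holds the write mmap_sem for the whole teardown,
	 * so a concurrent "ps" reading /proc/<pid>/cmdline blocks.
	 */
	return munmap(p, len);
}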

Zapping pages is the most time-consuming part. Following the suggestion
from Michal Hocko [1], zapping pages can be done while holding the read
mmap_sem, like MADV_DONTNEED does, and then the write mmap_sem is
re-acquired to manipulate the vmas.

Define the large mapping size threshold as PUD size or 1GB, and just zap
pages with the read mmap_sem for mappings that are >= this threshold.

If the vma has VM_LOCKED | VM_HUGETLB | VM_PFNMAP or uprobes, then just
fall back to the regular path since unmapping those mappings needs to
acquire the write mmap_sem.

For the time being, just do this in the munmap syscall path. Other
vm_munmap() or do_munmap() call sites remain intact since the complexity
of handling race conditions there seems to outweigh the benefits.

Below is some regression and performance data collected on a machine
with 32 cores of E5-2680 @ 2.70GHz and 384GB memory.

With the patched kernel, the write mmap_sem hold time drops from seconds
to the microsecond level.

Throughput of page faults (#/s) with the below stress-ng test:
stress-ng --mmap 0 --mmap-bytes 80G --mmap-file --metrics --perf
--timeout 600s
        pristine         patched          delta
       89.41K/sec       97.29K/sec        +8.8%

[1] https://lwn.net/Articles/753269/

Cc: Michal Hocko <mhocko@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
---
 mm/mmap.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 147 insertions(+), 1 deletion(-)

Comments

Peter Zijlstra June 19, 2018, 10:02 a.m. UTC | #1
On Tue, Jun 19, 2018 at 07:34:16AM +0800, Yang Shi wrote:

> diff --git a/mm/mmap.c b/mm/mmap.c
> index fc41c05..e84f80c 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -2686,6 +2686,141 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
>  	return __split_vma(mm, vma, addr, new_below);
>  }
>  
> +/* Consider PUD size or 1GB mapping as large mapping */
> +#ifdef HPAGE_PUD_SIZE
> +#define LARGE_MAP_THRESH	HPAGE_PUD_SIZE
> +#else
> +#define LARGE_MAP_THRESH	(1 * 1024 * 1024 * 1024)
> +#endif
> +
> +/* Unmap large mapping early with acquiring read mmap_sem */
> +static int do_munmap_zap_early(struct mm_struct *mm, unsigned long start,
> +			       size_t len, struct list_head *uf)
> +{
> +	unsigned long end = 0;
> +	struct vm_area_struct *vma = NULL, *prev, *last, *tmp;
> +	bool success = false;
> +	int ret = 0;
> +
> +	if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE - start)
> +		return -EINVAL;
> +
> +	len = (PAGE_ALIGN(len));
> +	if (len == 0)
> +		return -EINVAL;
> +
> +	/* Just deal with uf in regular path */
> +	if (unlikely(uf))
> +		goto regular_path;
> +
> +	if (len >= LARGE_MAP_THRESH) {
> +		down_read(&mm->mmap_sem);
> +		vma = find_vma(mm, start);
> +		if (!vma) {
> +			up_read(&mm->mmap_sem);
> +			return 0;
> +		}
> +
> +		prev = vma->vm_prev;
> +
> +		end = start + len;
> +		if (vma->vm_start > end) {
> +			up_read(&mm->mmap_sem);
> +			return 0;
> +		}
> +
> +		if (start > vma->vm_start) {
> +			int error;
> +
> +			if (end < vma->vm_end &&
> +			    mm->map_count > sysctl_max_map_count) {
> +				up_read(&mm->mmap_sem);
> +				return -ENOMEM;
> +			}
> +
> +			error = __split_vma(mm, vma, start, 0);
> +			if (error) {
> +				up_read(&mm->mmap_sem);
> +				return error;
> +			}
> +			prev = vma;
> +		}
> +
> +		last = find_vma(mm, end);
> +		if (last && end > last->vm_start) {
> +			int error = __split_vma(mm, last, end, 1);
> +
> +			if (error) {
> +				up_read(&mm->mmap_sem);
> +				return error;
> +			}
> +		}
> +		vma = prev ? prev->vm_next : mm->mmap;

Hold up, two things: you having to copy most of do_munmap() didn't seem
to suggest a helper function? And second, since when are we allowed to
split VMAs under a read lock?
Yang Shi June 19, 2018, 9:13 p.m. UTC | #2
On 6/19/18 3:02 AM, Peter Zijlstra wrote:
> On Tue, Jun 19, 2018 at 07:34:16AM +0800, Yang Shi wrote:
>
>> diff --git a/mm/mmap.c b/mm/mmap.c
>> index fc41c05..e84f80c 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> @@ -2686,6 +2686,141 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
>>   	return __split_vma(mm, vma, addr, new_below);
>>   }
>>   
>> +/* Consider PUD size or 1GB mapping as large mapping */
>> +#ifdef HPAGE_PUD_SIZE
>> +#define LARGE_MAP_THRESH	HPAGE_PUD_SIZE
>> +#else
>> +#define LARGE_MAP_THRESH	(1 * 1024 * 1024 * 1024)
>> +#endif
>> +
>> +/* Unmap large mapping early with acquiring read mmap_sem */
>> +static int do_munmap_zap_early(struct mm_struct *mm, unsigned long start,
>> +			       size_t len, struct list_head *uf)
>> +{
>> +	unsigned long end = 0;
>> +	struct vm_area_struct *vma = NULL, *prev, *last, *tmp;
>> +	bool success = false;
>> +	int ret = 0;
>> +
>> +	if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE - start)
>> +		return -EINVAL;
>> +
>> +	len = (PAGE_ALIGN(len));
>> +	if (len == 0)
>> +		return -EINVAL;
>> +
>> +	/* Just deal with uf in regular path */
>> +	if (unlikely(uf))
>> +		goto regular_path;
>> +
>> +	if (len >= LARGE_MAP_THRESH) {
>> +		down_read(&mm->mmap_sem);
>> +		vma = find_vma(mm, start);
>> +		if (!vma) {
>> +			up_read(&mm->mmap_sem);
>> +			return 0;
>> +		}
>> +
>> +		prev = vma->vm_prev;
>> +
>> +		end = start + len;
>> +		if (vma->vm_start > end) {
>> +			up_read(&mm->mmap_sem);
>> +			return 0;
>> +		}
>> +
>> +		if (start > vma->vm_start) {
>> +			int error;
>> +
>> +			if (end < vma->vm_end &&
>> +			    mm->map_count > sysctl_max_map_count) {
>> +				up_read(&mm->mmap_sem);
>> +				return -ENOMEM;
>> +			}
>> +
>> +			error = __split_vma(mm, vma, start, 0);
>> +			if (error) {
>> +				up_read(&mm->mmap_sem);
>> +				return error;
>> +			}
>> +			prev = vma;
>> +		}
>> +
>> +		last = find_vma(mm, end);
>> +		if (last && end > last->vm_start) {
>> +			int error = __split_vma(mm, last, end, 1);
>> +
>> +			if (error) {
>> +				up_read(&mm->mmap_sem);
>> +				return error;
>> +			}
>> +		}
>> +		vma = prev ? prev->vm_next : mm->mmap;
> Hold up, two things: you having to copy most of do_munmap() didn't seem
> to suggest a helper function? And second, since when are we allowed to

Yes, they will be extracted into a helper function in the next version.

My bad, I don't think it is allowed. We could rework this to:

acquire write mmap_sem
vma lookup (split vmas)
release write mmap_sem

acquire read mmap_sem
zap pages
release read mmap_sem

I suppose this is safe, as Michal said before.

Thanks,
Yang

> split VMAs under a read lock?
Nadav Amit June 19, 2018, 10:17 p.m. UTC | #3
at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote:

> When running some mmap/munmap scalability tests with large memory (i.e.
>> 300GB), the below hung task issue may happen occasionally.
> 
> INFO: task ps:14018 blocked for more than 120 seconds.
>       Tainted: G            E 4.9.79-009.ali3000.alios7.x86_64 #1
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
> message.
> ps              D    0 14018      1 0x00000004
> 
(snip)

> 
> Zapping pages is the most time consuming part, according to the
> suggestion from Michal Hock [1], zapping pages can be done with holding
> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write
> mmap_sem to manipulate vmas.

Does munmap() == MADV_DONTNEED + munmap() ?

For example, what happens with userfaultfd in this case? Can you get an
extra #PF, which would be visible to userspace, before the munmap is
finished?

In addition, would it be ok for the user to potentially get a zeroed page in
the time window after the MADV_DONTNEED finished removing a PTE and before
the munmap() is done?

Regards,
Nadav
Yang Shi June 19, 2018, 11:08 p.m. UTC | #4
On 6/19/18 3:17 PM, Nadav Amit wrote:
> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote:
>
>> When running some mmap/munmap scalability tests with large memory (i.e.
>>> 300GB), the below hung task issue may happen occasionally.
>> INFO: task ps:14018 blocked for more than 120 seconds.
>>        Tainted: G            E 4.9.79-009.ali3000.alios7.x86_64 #1
>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
>> message.
>> ps              D    0 14018      1 0x00000004
>>
> (snip)
>
>> Zapping pages is the most time consuming part, according to the
>> suggestion from Michal Hock [1], zapping pages can be done with holding
>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write
>> mmap_sem to manipulate vmas.
> Does munmap() == MADV_DONTNEED + munmap() ?

Not exactly the same. So, I basically copied the page zapping used by 
munmap instead of calling MADV_DONTNEED.

>
> For example, what happens with userfaultfd in this case? Can you get an
> extra #PF, which would be visible to userspace, before the munmap is
> finished?

userfaultfd is handled by regular munmap path. So, no change to 
userfaultfd part.

>
> In addition, would it be ok for the user to potentially get a zeroed page in
> the time window after the MADV_DONTNEED finished removing a PTE and before
> the munmap() is done?

This should be undefined behavior according to Michal. This has been 
discussed in https://lwn.net/Articles/753269/.

Thanks,
Yang

>
> Regards,
> Nadav
Nadav Amit June 20, 2018, 12:31 a.m. UTC | #5
at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote:

> 
> 
> On 6/19/18 3:17 PM, Nadav Amit wrote:
>> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com>
>>  wrote:
>> 
>> 
>>> When running some mmap/munmap scalability tests with large memory (i.e.
>>> 
>>>> 300GB), the below hung task issue may happen occasionally.
>>>> 
>>> INFO: task ps:14018 blocked for more than 120 seconds.
>>>       Tainted: G            E 4.9.79-009.ali3000.alios7.x86_64 #1
>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
>>> message.
>>> ps              D    0 14018      1 0x00000004
>>> 
>>> 
>> (snip)
>> 
>> 
>>> Zapping pages is the most time consuming part, according to the
>>> suggestion from Michal Hock [1], zapping pages can be done with holding
>>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write
>>> mmap_sem to manipulate vmas.
>>> 
>> Does munmap() == MADV_DONTNEED + munmap() ?
> 
> Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED.
> 
>> 
>> For example, what happens with userfaultfd in this case? Can you get an
>> extra #PF, which would be visible to userspace, before the munmap is
>> finished?
>> 
> 
> userfaultfd is handled by regular munmap path. So, no change to userfaultfd part.

Right. I see it now.

> 
>> 
>> In addition, would it be ok for the user to potentially get a zeroed page in
>> the time window after the MADV_DONTNEED finished removing a PTE and before
>> the munmap() is done?
>> 
> 
> This should be undefined behavior according to Michal. This has been discussed in  https://lwn.net/Articles/753269/.

Thanks for the reference.

Reading the man page I see: "All pages containing a part of the indicated
range are unmapped, and subsequent references to these pages will generate
SIGSEGV.”

To me it sounds pretty well-defined, and this implementation does not follow
that definition. I would expect the man page to be updated to indicate that
the behavior has changed.

Regards,
Nadav
Michal Hocko June 20, 2018, 7:17 a.m. UTC | #6
On Tue 19-06-18 14:13:05, Yang Shi wrote:
> 
> 
> On 6/19/18 3:02 AM, Peter Zijlstra wrote:
[...]
> > Hold up, two things: you having to copy most of do_munmap() didn't seem
> > to suggest a helper function? And second, since when are we allowed to
> 
> Yes, they will be extracted into a helper function in the next version.
> 
> May bad, I don't think it is allowed. We could reform this to:
> 
> acquire write mmap_sem
> vma lookup (split vmas)
> release write mmap_sem
> 
> acquire read mmap_sem
> zap pages
> release read mmap_sem
> 
> I'm supposed this is safe as what Michal said before.

I didn't get to read your patches carefully yet but I am wondering why
do you need to split in the first place. Why cannot you simply unmap the
range (madvise(DONTNEED)) under the read lock and then take the lock for
write to finish the rest?
Michal Hocko June 20, 2018, 7:18 a.m. UTC | #7
On Tue 19-06-18 17:31:27, Nadav Amit wrote:
> at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote:
> 
> > 
> > 
> > On 6/19/18 3:17 PM, Nadav Amit wrote:
> >> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com>
> >>  wrote:
> >> 
> >> 
> >>> When running some mmap/munmap scalability tests with large memory (i.e.
> >>> 
> >>>> 300GB), the below hung task issue may happen occasionally.
> >>>> 
> >>> INFO: task ps:14018 blocked for more than 120 seconds.
> >>>       Tainted: G            E 4.9.79-009.ali3000.alios7.x86_64 #1
> >>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
> >>> message.
> >>> ps              D    0 14018      1 0x00000004
> >>> 
> >>> 
> >> (snip)
> >> 
> >> 
> >>> Zapping pages is the most time consuming part, according to the
> >>> suggestion from Michal Hock [1], zapping pages can be done with holding
> >>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write
> >>> mmap_sem to manipulate vmas.
> >>> 
> >> Does munmap() == MADV_DONTNEED + munmap() ?
> > 
> > Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED.
> > 
> >> 
> >> For example, what happens with userfaultfd in this case? Can you get an
> >> extra #PF, which would be visible to userspace, before the munmap is
> >> finished?
> >> 
> > 
> > userfaultfd is handled by regular munmap path. So, no change to userfaultfd part.
> 
> Right. I see it now.
> 
> > 
> >> 
> >> In addition, would it be ok for the user to potentially get a zeroed page in
> >> the time window after the MADV_DONTNEED finished removing a PTE and before
> >> the munmap() is done?
> >> 
> > 
> > This should be undefined behavior according to Michal. This has been discussed in  https://lwn.net/Articles/753269/.
> 
> Thanks for the reference.
> 
> Reading the man page I see: "All pages containing a part of the indicated
> range are unmapped, and subsequent references to these pages will generate
> SIGSEGV.”

Yes, this is true but I guess what Yang Shi meant was that a userspace
access racing with munmap is not well defined. You never know whether
you get your data, a #PF or a SEGV because it depends on timing. The user
visible change might be that you lose content and get a zero page instead
if you hit the race window while we are unmapping, which was not possible
before. But wouldn't such an access pattern be buggy anyway? You need
some form of external synchronization AFAICS.

But maybe some userspace depends on the "get the right data or get SEGV"
semantic. If we have to preserve that then we can come up with a VM_DEAD
flag set before we tear it down and force the SEGV on the #PF path.
We already do something similar for MMF_UNSTABLE.
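
For reference, the VM_DEAD idea would look roughly like the sketch below
in the page fault path. The flag and the helper are hypothetical at this
point in the discussion; nothing like them exists in the tree yet:

/*
 * Hypothetical sketch: munmap would set VM_DEAD under the write
 * mmap_sem before tearing the range down; the fault path would then
 * refuse to instantiate pages there, similar in spirit to the
 * MMF_UNSTABLE handling, so a racing access gets SIGSEGV rather
 * than a freshly zeroed page.
 */
static inline int check_vm_dead(struct vm_area_struct *vma)
{
	if (unlikely(vma->vm_flags & VM_DEAD))
		return VM_FAULT_SIGSEGV;
	return 0;
}

Such a check would have to run early in handle_mm_fault(), before any
page table work is done.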
Yang Shi June 20, 2018, 4:23 p.m. UTC | #8
On 6/20/18 12:17 AM, Michal Hocko wrote:
> On Tue 19-06-18 14:13:05, Yang Shi wrote:
>>
>> On 6/19/18 3:02 AM, Peter Zijlstra wrote:
> [...]
>>> Hold up, two things: you having to copy most of do_munmap() didn't seem
>>> to suggest a helper function? And second, since when are we allowed to
>> Yes, they will be extracted into a helper function in the next version.
>>
>> May bad, I don't think it is allowed. We could reform this to:
>>
>> acquire write mmap_sem
>> vma lookup (split vmas)
>> release write mmap_sem
>>
>> acquire read mmap_sem
>> zap pages
>> release read mmap_sem
>>
>> I'm supposed this is safe as what Michal said before.
> I didn't get to read your patches carefully yet but I am wondering why
> do you need to split in the first place. Why cannot you simply unmap the
> range (madvise(DONTNEED)) under the read lock and then take the lock for
> write to finish the rest?

Yes, we can. I just thought splitting the vma up-front sounded more
straightforward, but I neglected the write mmap_sem issue. I will move the
vma split under the later write mmap_sem in the next version.

Thanks,
Yang
Nadav Amit June 20, 2018, 5:12 p.m. UTC | #9
at 12:18 AM, Michal Hocko <mhocko@kernel.org> wrote:

> On Tue 19-06-18 17:31:27, Nadav Amit wrote:
>> at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote:
>> 
>>> On 6/19/18 3:17 PM, Nadav Amit wrote:
>>>> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com>
>>>> wrote:
>>>> 
>>>> 
>>>>> When running some mmap/munmap scalability tests with large memory (i.e.
>>>>> 
>>>>>> 300GB), the below hung task issue may happen occasionally.
>>>>> INFO: task ps:14018 blocked for more than 120 seconds.
>>>>>      Tainted: G            E 4.9.79-009.ali3000.alios7.x86_64 #1
>>>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
>>>>> message.
>>>>> ps              D    0 14018      1 0x00000004
>>>> (snip)
>>>> 
>>>> 
>>>>> Zapping pages is the most time consuming part, according to the
>>>>> suggestion from Michal Hock [1], zapping pages can be done with holding
>>>>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write
>>>>> mmap_sem to manipulate vmas.
>>>> Does munmap() == MADV_DONTNEED + munmap() ?
>>> 
>>> Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED.
>>> 
>>>> For example, what happens with userfaultfd in this case? Can you get an
>>>> extra #PF, which would be visible to userspace, before the munmap is
>>>> finished?
>>> 
>>> userfaultfd is handled by regular munmap path. So, no change to userfaultfd part.
>> 
>> Right. I see it now.
>> 
>>>> In addition, would it be ok for the user to potentially get a zeroed page in
>>>> the time window after the MADV_DONTNEED finished removing a PTE and before
>>>> the munmap() is done?
>>> 
>>> This should be undefined behavior according to Michal. This has been discussed in https://lwn.net/Articles/753269/.
>> 
>> Thanks for the reference.
>> 
>> Reading the man page I see: "All pages containing a part of the indicated
>> range are unmapped, and subsequent references to these pages will generate
>> SIGSEGV.”
> 
> Yes, this is true but I guess what Yang Shi meant was that an userspace
> access racing with munmap is not well defined. You never know whether
> you get your data, #PTF or SEGV because it depends on timing. The user
> visible change might be that you lose content and get zero page instead
> if you hit the race window while we are unmapping which was not possible
> before. But whouldn't such an access pattern be buggy anyway? You need
> some form of external synchronization AFAICS.

It seems to follow the specification, so it is not clearly buggy IMHO. I
don't know of such a use case, but if somebody does rely on it, the proposed
change might even cause a security vulnerability.

> But maybe some userspace depends on "getting right data or get SEGV"
> semantic. If we have to preserve that then we can come up with a VM_DEAD
> flag set before we tear it down and force the SEGV on the #PF path.
> Something similar we already do for MMF_UNSTABLE.

That seems reasonable.

Regards,
Nadav
Yang Shi June 20, 2018, 6:42 p.m. UTC | #10
On 6/20/18 12:18 AM, Michal Hocko wrote:
> On Tue 19-06-18 17:31:27, Nadav Amit wrote:
>> at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote:
>>
>>>
>>> On 6/19/18 3:17 PM, Nadav Amit wrote:
>>>> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com>
>>>>   wrote:
>>>>
>>>>
>>>>> When running some mmap/munmap scalability tests with large memory (i.e.
>>>>>
>>>>>> 300GB), the below hung task issue may happen occasionally.
>>>>>>
>>>>> INFO: task ps:14018 blocked for more than 120 seconds.
>>>>>        Tainted: G            E 4.9.79-009.ali3000.alios7.x86_64 #1
>>>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
>>>>> message.
>>>>> ps              D    0 14018      1 0x00000004
>>>>>
>>>>>
>>>> (snip)
>>>>
>>>>
>>>>> Zapping pages is the most time consuming part, according to the
>>>>> suggestion from Michal Hock [1], zapping pages can be done with holding
>>>>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write
>>>>> mmap_sem to manipulate vmas.
>>>>>
>>>> Does munmap() == MADV_DONTNEED + munmap() ?
>>> Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED.
>>>
>>>> For example, what happens with userfaultfd in this case? Can you get an
>>>> extra #PF, which would be visible to userspace, before the munmap is
>>>> finished?
>>>>
>>> userfaultfd is handled by regular munmap path. So, no change to userfaultfd part.
>> Right. I see it now.
>>
>>>> In addition, would it be ok for the user to potentially get a zeroed page in
>>>> the time window after the MADV_DONTNEED finished removing a PTE and before
>>>> the munmap() is done?
>>>>
>>> This should be undefined behavior according to Michal. This has been discussed in  https://lwn.net/Articles/753269/.
>> Thanks for the reference.
>>
>> Reading the man page I see: "All pages containing a part of the indicated
>> range are unmapped, and subsequent references to these pages will generate
>> SIGSEGV.”
> Yes, this is true but I guess what Yang Shi meant was that an userspace
> access racing with munmap is not well defined. You never know whether
> you get your data, #PTF or SEGV because it depends on timing. The user
> visible change might be that you lose content and get zero page instead
> if you hit the race window while we are unmapping which was not possible
> before. But whouldn't such an access pattern be buggy anyway? You need
> some form of external synchronization AFAICS.
>
> But maybe some userspace depends on "getting right data or get SEGV"
> semantic. If we have to preserve that then we can come up with a VM_DEAD
> flag set before we tear it down and force the SEGV on the #PF path.
> Something similar we already do for MMF_UNSTABLE.

Set VM_DEAD with the read mmap_sem held? It should be OK since this is the 
only place that sets this flag, for this unique special case.

Yang
Yang Shi June 23, 2018, 1:01 a.m. UTC | #11
>> Yes, this is true but I guess what Yang Shi meant was that an userspace
>> access racing with munmap is not well defined. You never know whether
>> you get your data, #PTF or SEGV because it depends on timing. The user
>> visible change might be that you lose content and get zero page instead
>> if you hit the race window while we are unmapping which was not possible
>> before. But whouldn't such an access pattern be buggy anyway? You need
>> some form of external synchronization AFAICS.
>>
>> But maybe some userspace depends on "getting right data or get SEGV"
>> semantic. If we have to preserve that then we can come up with a VM_DEAD
>> flag set before we tear it down and force the SEGV on the #PF path.
>> Something similar we already do for MMF_UNSTABLE.
>
> Set VM_DEAD with read mmap_sem held? It should be ok since this is the 
> only place to set this flag for this unique special case.

BTW, it looks like the vm flags have all been used up on 32-bit. If we 
really need VM_DEAD, it should be available for both 32-bit and 64-bit.

Any suggestion?

Thanks,
Yang

>
> Yang
>
>
Michal Hocko June 25, 2018, 9:14 a.m. UTC | #12
On Fri 22-06-18 18:01:08, Yang Shi wrote:
> Yes, this is true but I guess what Yang Shi meant was that an userspace
> > > access racing with munmap is not well defined. You never know whether
> > > you get your data, #PTF or SEGV because it depends on timing. The user
> > > visible change might be that you lose content and get zero page instead
> > > if you hit the race window while we are unmapping which was not possible
> > > before. But whouldn't such an access pattern be buggy anyway? You need
> > > some form of external synchronization AFAICS.
> > > 
> > > But maybe some userspace depends on "getting right data or get SEGV"
> > > semantic. If we have to preserve that then we can come up with a VM_DEAD
> > > flag set before we tear it down and force the SEGV on the #PF path.
> > > Something similar we already do for MMF_UNSTABLE.
> > 
> > Set VM_DEAD with read mmap_sem held? It should be ok since this is the
> > only place to set this flag for this unique special case.
> 
> BTW, it looks the vm flags have used up in 32 bit. If we really need
> VM_DEAD, it should be for both 32-bit and 64-bit.

Do we really need any special handling for 32b? Who is going to create
GB mappings for all this to be worth doing?
Yang Shi June 26, 2018, 12:06 a.m. UTC | #13
On 6/20/18 12:18 AM, Michal Hocko wrote:
> On Tue 19-06-18 17:31:27, Nadav Amit wrote:
>> at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote:
>>
>>>
>>> On 6/19/18 3:17 PM, Nadav Amit wrote:
>>>> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com>
>>>>   wrote:
>>>>
>>>>
>>>>> When running some mmap/munmap scalability tests with large memory (i.e.
>>>>>
>>>>>> 300GB), the below hung task issue may happen occasionally.
>>>>>>
>>>>> INFO: task ps:14018 blocked for more than 120 seconds.
>>>>>        Tainted: G            E 4.9.79-009.ali3000.alios7.x86_64 #1
>>>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
>>>>> message.
>>>>> ps              D    0 14018      1 0x00000004
>>>>>
>>>>>
>>>> (snip)
>>>>
>>>>
>>>>> Zapping pages is the most time consuming part, according to the
>>>>> suggestion from Michal Hock [1], zapping pages can be done with holding
>>>>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write
>>>>> mmap_sem to manipulate vmas.
>>>>>
>>>> Does munmap() == MADV_DONTNEED + munmap() ?
>>> Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED.
>>>
>>>> For example, what happens with userfaultfd in this case? Can you get an
>>>> extra #PF, which would be visible to userspace, before the munmap is
>>>> finished?
>>>>
>>> userfaultfd is handled by regular munmap path. So, no change to userfaultfd part.
>> Right. I see it now.
>>
>>>> In addition, would it be ok for the user to potentially get a zeroed page in
>>>> the time window after the MADV_DONTNEED finished removing a PTE and before
>>>> the munmap() is done?
>>>>
>>> This should be undefined behavior according to Michal. This has been discussed in  https://lwn.net/Articles/753269/.
>> Thanks for the reference.
>>
>> Reading the man page I see: "All pages containing a part of the indicated
>> range are unmapped, and subsequent references to these pages will generate
>> SIGSEGV.”
> Yes, this is true but I guess what Yang Shi meant was that an userspace
> access racing with munmap is not well defined. You never know whether
> you get your data, #PTF or SEGV because it depends on timing. The user
> visible change might be that you lose content and get zero page instead
> if you hit the race window while we are unmapping which was not possible
> before. But whouldn't such an access pattern be buggy anyway? You need
> some form of external synchronization AFAICS.
>
> But maybe some userspace depends on "getting right data or get SEGV"
> semantic. If we have to preserve that then we can come up with a VM_DEAD
> flag set before we tear it down and force the SEGV on the #PF path.
> Something similar we already do for MMF_UNSTABLE.

By looking at this more deeply, we may not be able to cover the whole 
unmapping range with VM_DEAD, for example, if the start addr is in the 
middle of a vma. We can't set VM_DEAD on that vma since that would trigger 
SIGSEGV for the still-mapped area.

Splitting can't be done with the read mmap_sem held, so maybe just set 
VM_DEAD on the non-overlapped vmas. Access to the overlapped vmas (first 
and last) will still have undefined behavior.

Thanks,
Yang
Peter Zijlstra June 26, 2018, 7:43 a.m. UTC | #14
On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
> By looking this deeper, we may not be able to cover all the unmapping range
> for VM_DEAD, for example, if the start addr is in the middle of a vma. We
> can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
> mapped area.
> 
> splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
> to non-overlapped vmas. Access to overlapped vmas (first and last) will
> still have undefined behavior.

Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
writing, free everything left, drop mmap_sem.

?

Sure, you acquire the lock 3 times, but both write instances should be
'short', and I suppose you can do a demote between 1 and 2 if you care.
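
In code, the three-phase flow Peter describes would look roughly like the
sketch below; error handling is elided and the VM_DEAD marking is still
hypothetical:

static int munmap_three_phase(struct mm_struct *mm, unsigned long start,
			      size_t len)
{
	/* Phase 1: short write-locked section -- split and mark */
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	/* __split_vma() at start and start + len, mark the vmas VM_DEAD */
	up_write(&mm->mmap_sem);

	/* Phase 2: the expensive page zapping under the read lock */
	down_read(&mm->mmap_sem);
	/* zap the range MADV_DONTNEED style, e.g. via unmap_region() */
	up_read(&mm->mmap_sem);

	/* Phase 3: short write-locked section -- detach and free vmas */
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	/* detach_vmas_to_be_unmapped(), remove_vma_list(), accounting */
	up_write(&mm->mmap_sem);

	return 0;
}

The demote Peter mentions between phases 1 and 2 would replace the first
up_write()/down_read() pair with a single downgrade_write().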
Yang Shi June 27, 2018, 1:03 a.m. UTC | #15
On 6/26/18 12:43 AM, Peter Zijlstra wrote:
> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
>> By looking this deeper, we may not be able to cover all the unmapping range
>> for VM_DEAD, for example, if the start addr is in the middle of a vma. We
>> can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
>> mapped area.
>>
>> splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
>> to non-overlapped vmas. Access to overlapped vmas (first and last) will
>> still have undefined behavior.
> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
> writing, free everything left, drop mmap_sem.
>
> ?
>
> Sure, you acquire the lock 3 times, but both write instances should be
> 'short', and I suppose you can do a demote between 1 and 2 if you care.

Thanks, Peter. Yes, by looking the code and trying two different 
approaches, it looks this approach is the most straight-forward one.

Splitting vma up-front can save a lot pain later. Holding write mmap_sem 
for this job before zapping mappings sounds worth the cost (very short 
write critical section).

And, VM_DEAD can be set exclusively with write mmap_sem without racing 
with page faults, this will give us consistent behavior for the race 
between PF and munmap. And, we don't need care about overlapped vma 
since it has been split before.

Yang
Michal Hocko June 27, 2018, 7:24 a.m. UTC | #16
On Tue 26-06-18 18:03:34, Yang Shi wrote:
> 
> 
> On 6/26/18 12:43 AM, Peter Zijlstra wrote:
> > On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
> > > By looking this deeper, we may not be able to cover all the unmapping range
> > > for VM_DEAD, for example, if the start addr is in the middle of a vma. We
> > > can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
> > > mapped area.
> > > 
> > > splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
> > > to non-overlapped vmas. Access to overlapped vmas (first and last) will
> > > still have undefined behavior.
> > Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
> > mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
> > writing, free everything left, drop mmap_sem.
> > 
> > ?
> > 
> > Sure, you acquire the lock 3 times, but both write instances should be
> > 'short', and I suppose you can do a demote between 1 and 2 if you care.
> 
> Thanks, Peter. Yes, by looking the code and trying two different approaches,
> it looks this approach is the most straight-forward one.

Yes, you just have to be careful about the max vma count limit.
Yang Shi June 27, 2018, 5:23 p.m. UTC | #17
On 6/27/18 12:24 AM, Michal Hocko wrote:
> On Tue 26-06-18 18:03:34, Yang Shi wrote:
>>
>> On 6/26/18 12:43 AM, Peter Zijlstra wrote:
>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
>>>> By looking this deeper, we may not be able to cover all the unmapping range
>>>> for VM_DEAD, for example, if the start addr is in the middle of a vma. We
>>>> can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
>>>> mapped area.
>>>>
>>>> splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
>>>> to non-overlapped vmas. Access to overlapped vmas (first and last) will
>>>> still have undefined behavior.
>>> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
>>> writing, free everything left, drop mmap_sem.
>>>
>>> ?
>>>
>>> Sure, you acquire the lock 3 times, but both write instances should be
>>> 'short', and I suppose you can do a demote between 1 and 2 if you care.
>> Thanks, Peter. Yes, by looking the code and trying two different approaches,
>> it looks this approach is the most straight-forward one.
> Yes, you just have to be careful about the max vma count limit.

Yes, we just need to copy what do_munmap() does, as below:

if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
             return -ENOMEM;

If the max map count limit has been reached, it will return failure 
before zapping mappings.

Thanks,
Yang
Michal Hocko June 28, 2018, 11:51 a.m. UTC | #18
On Wed 27-06-18 10:23:39, Yang Shi wrote:
> 
> 
> On 6/27/18 12:24 AM, Michal Hocko wrote:
> > On Tue 26-06-18 18:03:34, Yang Shi wrote:
> > > 
> > > On 6/26/18 12:43 AM, Peter Zijlstra wrote:
> > > > On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
> > > > > By looking this deeper, we may not be able to cover all the unmapping range
> > > > > for VM_DEAD, for example, if the start addr is in the middle of a vma. We
> > > > > can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
> > > > > mapped area.
> > > > > 
> > > > > splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
> > > > > to non-overlapped vmas. Access to overlapped vmas (first and last) will
> > > > > still have undefined behavior.
> > > > Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
> > > > mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
> > > > writing, free everything left, drop mmap_sem.
> > > > 
> > > > ?
> > > > 
> > > > Sure, you acquire the lock 3 times, but both write instances should be
> > > > 'short', and I suppose you can do a demote between 1 and 2 if you care.
> > > Thanks, Peter. Yes, by looking the code and trying two different approaches,
> > > it looks this approach is the most straight-forward one.
> > Yes, you just have to be careful about the max vma count limit.
> 
> Yes, we should just need copy what do_munmap does as below:
> 
> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
>             return -ENOMEM;
> 
> If the mas map count limit has been reached, it will return failure before
> zapping mappings.

Yeah, but as soon as you drop the lock and retake it, somebody might
have changed the address space and we might get an inconsistency.

So I am wondering whether we really need upgrade_read (to promote read
to write lock) and do the
	down_write
	split & set up VM_DEAD
	downgrade_write
	unmap
	upgrade_read
	zap ptes
	up_write

looks terrible, no question about that, but we won't drop the mmap sem
at any time.
Yang Shi June 28, 2018, 7:10 p.m. UTC | #19
On 6/28/18 4:51 AM, Michal Hocko wrote:
> On Wed 27-06-18 10:23:39, Yang Shi wrote:
>>
>> On 6/27/18 12:24 AM, Michal Hocko wrote:
>>> On Tue 26-06-18 18:03:34, Yang Shi wrote:
>>>> On 6/26/18 12:43 AM, Peter Zijlstra wrote:
>>>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
>>>>>> By looking this deeper, we may not be able to cover all the unmapping range
>>>>>> for VM_DEAD, for example, if the start addr is in the middle of a vma. We
>>>>>> can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
>>>>>> mapped area.
>>>>>>
>>>>>> splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
>>>>>> to non-overlapped vmas. Access to overlapped vmas (first and last) will
>>>>>> still have undefined behavior.
>>>>> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
>>>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
>>>>> writing, free everything left, drop mmap_sem.
>>>>>
>>>>> ?
>>>>>
>>>>> Sure, you acquire the lock 3 times, but both write instances should be
>>>>> 'short', and I suppose you can do a demote between 1 and 2 if you care.
>>>> Thanks, Peter. Yes, by looking the code and trying two different approaches,
>>>> it looks this approach is the most straight-forward one.
>>> Yes, you just have to be careful about the max vma count limit.
>> Yes, we should just need copy what do_munmap does as below:
>>
>> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
>>              return -ENOMEM;
>>
>> If the mas map count limit has been reached, it will return failure before
>> zapping mappings.
> Yeah, but as soon as you drop the lock and retake it, somebody might
> have changed the adddress space and we might get inconsistency.
>
> So I am wondering whether we really need upgrade_read (to promote read
> to write lock) and do the
> 	down_write
> 	split & set up VM_DEAD
> 	downgrade_write
> 	unmap
> 	upgrade_read
> 	zap ptes
> 	up_write

I suppose address space changes can only be done by mmap, mremap and 
mprotect. If so, we may utilize the new VM_DEAD flag: if the VM_DEAD 
flag is set for the vma, just return failure since it is being unmapped.

Does it sound reasonable?

Thanks,
Yang

>
> looks terrible, no question about that, but we won't drop the mmap sem
> at any time.
Yang Shi June 29, 2018, 12:59 a.m. UTC | #20
On 6/28/18 12:10 PM, Yang Shi wrote:
>
>
> On 6/28/18 4:51 AM, Michal Hocko wrote:
>> On Wed 27-06-18 10:23:39, Yang Shi wrote:
>>>
>>> On 6/27/18 12:24 AM, Michal Hocko wrote:
>>>> On Tue 26-06-18 18:03:34, Yang Shi wrote:
>>>>> On 6/26/18 12:43 AM, Peter Zijlstra wrote:
>>>>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
>>>>>>> By looking this deeper, we may not be able to cover all the 
>>>>>>> unmapping range
>>>>>>> for VM_DEAD, for example, if the start addr is in the middle of 
>>>>>>> a vma. We
>>>>>>> can't set VM_DEAD to that vma since that would trigger SIGSEGV 
>>>>>>> for still
>>>>>>> mapped area.
>>>>>>>
>>>>>>> splitting can't be done with read mmap_sem held, so maybe just 
>>>>>>> set VM_DEAD
>>>>>>> to non-overlapped vmas. Access to overlapped vmas (first and 
>>>>>>> last) will
>>>>>>> still have undefined behavior.
>>>>>> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. 
>>>>>> Acquire
>>>>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
>>>>>> writing, free everything left, drop mmap_sem.
>>>>>>
>>>>>> ?
>>>>>>
>>>>>> Sure, you acquire the lock 3 times, but both write instances 
>>>>>> should be
>>>>>> 'short', and I suppose you can do a demote between 1 and 2 if you 
>>>>>> care.
>>>>> Thanks, Peter. Yes, by looking the code and trying two different 
>>>>> approaches,
>>>>> it looks this approach is the most straight-forward one.
>>>> Yes, you just have to be careful about the max vma count limit.
>>> Yes, we should just need copy what do_munmap does as below:
>>>
>>> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
>>>              return -ENOMEM;
>>>
>>> If the mas map count limit has been reached, it will return failure 
>>> before
>>> zapping mappings.
>> Yeah, but as soon as you drop the lock and retake it, somebody might
>> have changed the adddress space and we might get inconsistency.
>>
>> So I am wondering whether we really need upgrade_read (to promote read
>> to write lock) and do the
>>     down_write
>>     split & set up VM_DEAD
>>     downgrade_write
>>     unmap
>>     upgrade_read
>>     zap ptes
>>     up_write

Promoting to the write lock may be a problem. There might be other users in 
the critical section with the read lock, and we would have to wait for them 
to finish.

>
> I'm supposed address space changing just can be done by mmap, mremap, 
> mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD 
> flag is set for the vma, just return failure since it is being unmapped.
>
> Does it sounds reasonable?

It looks like we just need to care about MAP_FIXED (mmap) and MREMAP_FIXED 
(mremap), right?

How about letting them return -EBUSY or -EAGAIN to notify the 
application? This changes the behavior a little bit: MAP_FIXED and mremap 
may fail if they lose the race with munmap (if the mapping is larger than 
1GB). I'm not sure whether any multi-threaded application uses MAP_FIXED 
and MREMAP_FIXED heavily enough to run into the race condition. I guess 
it should be rare to meet all the conditions needed to trigger the race.

The programmer should be very cautious about MAP_FIXED/MREMAP_FIXED 
since they may corrupt the process's own address space, as the man page 
notes.


Thanks,
Yang

>
> Thanks,
> Yang
>
>>
>> looks terrible, no question about that, but we won't drop the mmap sem
>> at any time.
>
Michal Hocko June 29, 2018, 11:34 a.m. UTC | #21
On Thu 28-06-18 12:10:10, Yang Shi wrote:
> 
> 
> On 6/28/18 4:51 AM, Michal Hocko wrote:
> > On Wed 27-06-18 10:23:39, Yang Shi wrote:
> > > 
> > > On 6/27/18 12:24 AM, Michal Hocko wrote:
> > > > On Tue 26-06-18 18:03:34, Yang Shi wrote:
> > > > > On 6/26/18 12:43 AM, Peter Zijlstra wrote:
> > > > > > On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
> > > > > > > By looking this deeper, we may not be able to cover all the unmapping range
> > > > > > > for VM_DEAD, for example, if the start addr is in the middle of a vma. We
> > > > > > > can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
> > > > > > > mapped area.
> > > > > > > 
> > > > > > > splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
> > > > > > > to non-overlapped vmas. Access to overlapped vmas (first and last) will
> > > > > > > still have undefined behavior.
> > > > > > Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
> > > > > > mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
> > > > > > writing, free everything left, drop mmap_sem.
> > > > > > 
> > > > > > ?
> > > > > > 
> > > > > > Sure, you acquire the lock 3 times, but both write instances should be
> > > > > > 'short', and I suppose you can do a demote between 1 and 2 if you care.
> > > > > Thanks, Peter. Yes, by looking the code and trying two different approaches,
> > > > > it looks this approach is the most straight-forward one.
> > > > Yes, you just have to be careful about the max vma count limit.
> > > Yes, we should just need copy what do_munmap does as below:
> > > 
> > > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> > >              return -ENOMEM;
> > > 
> > > If the mas map count limit has been reached, it will return failure before
> > > zapping mappings.
> > Yeah, but as soon as you drop the lock and retake it, somebody might
> > have changed the adddress space and we might get inconsistency.
> > 
> > So I am wondering whether we really need upgrade_read (to promote read
> > to write lock) and do the
> > 	down_write
> > 	split & set up VM_DEAD
> > 	downgrade_write
> > 	unmap
> > 	upgrade_read
> > 	zap ptes
> > 	up_write
> 
> I'm supposed address space changing just can be done by mmap, mremap,
> mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD flag is
> set for the vma, just return failure since it is being unmapped.

I am sorry, I do not follow. How does the VM_DEAD flag help with completely
unrelated vmas? Or maybe it would be better to post the code to see what
you mean exactly.
Michal Hocko June 29, 2018, 11:39 a.m. UTC | #22
On Thu 28-06-18 17:59:25, Yang Shi wrote:
> 
> 
> On 6/28/18 12:10 PM, Yang Shi wrote:
> > 
> > 
> > On 6/28/18 4:51 AM, Michal Hocko wrote:
> > > On Wed 27-06-18 10:23:39, Yang Shi wrote:
> > > > 
> > > > On 6/27/18 12:24 AM, Michal Hocko wrote:
> > > > > On Tue 26-06-18 18:03:34, Yang Shi wrote:
> > > > > > On 6/26/18 12:43 AM, Peter Zijlstra wrote:
> > > > > > > On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
> > > > > > > > By looking this deeper, we may not be able to
> > > > > > > > cover all the unmapping range
> > > > > > > > for VM_DEAD, for example, if the start addr is
> > > > > > > > in the middle of a vma. We
> > > > > > > > can't set VM_DEAD to that vma since that would
> > > > > > > > trigger SIGSEGV for still
> > > > > > > > mapped area.
> > > > > > > > 
> > > > > > > > splitting can't be done with read mmap_sem held,
> > > > > > > > so maybe just set VM_DEAD
> > > > > > > > to non-overlapped vmas. Access to overlapped
> > > > > > > > vmas (first and last) will
> > > > > > > > still have undefined behavior.
> > > > > > > Acquire mmap_sem for writing, split, mark VM_DEAD,
> > > > > > > drop mmap_sem. Acquire
> > > > > > > mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
> > > > > > > writing, free everything left, drop mmap_sem.
> > > > > > > 
> > > > > > > ?
> > > > > > > 
> > > > > > > Sure, you acquire the lock 3 times, but both write
> > > > > > > instances should be
> > > > > > > 'short', and I suppose you can do a demote between 1
> > > > > > > and 2 if you care.
> > > > > > Thanks, Peter. Yes, by looking the code and trying two
> > > > > > different approaches,
> > > > > > it looks this approach is the most straight-forward one.
> > > > > Yes, you just have to be careful about the max vma count limit.
> > > > Yes, we should just need copy what do_munmap does as below:
> > > > 
> > > > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> > > >              return -ENOMEM;
> > > > 
> > > > If the mas map count limit has been reached, it will return
> > > > failure before
> > > > zapping mappings.
> > > Yeah, but as soon as you drop the lock and retake it, somebody might
> > > have changed the adddress space and we might get inconsistency.
> > > 
> > > So I am wondering whether we really need upgrade_read (to promote read
> > > to write lock) and do the
> > >     down_write
> > >     split & set up VM_DEAD
> > >     downgrade_write
> > >     unmap
> > >     upgrade_read
> > >     zap ptes
> > >     up_write
> 
> Promoting to write lock may be a trouble. There might be other users in the
> critical section with read lock, we have to wait them to finish.

Yes. Is that a problem though?
 
> > I'm supposed address space changing just can be done by mmap, mremap,
> > mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD
> > flag is set for the vma, just return failure since it is being unmapped.
> > 
> > Does it sounds reasonable?
> 
> It looks we just need care about MAP_FIXED (mmap) and MREMAP_FIXED (mremap),
> right?
> 
> How about letting them return -EBUSY or -EAGAIN to notify the application?

Well, none of those is documented to return EBUSY, and EAGAIN already has
a meaning for locked memory.

> This changes the behavior a little bit, MAP_FIXED and mremap may fail if
> they fail the race with munmap (if the mapping is larger than 1GB). I'm not
> sure if any multi-threaded application uses MAP_FIXED and MREMAP_FIXED very
> heavily which may run into the race condition. I guess it should be rare to
> meet all the conditions to trigger the race.
> 
> The programmer should be very cautious about MAP_FIXED.MREMAP_FIXED since
> they may corrupt its own address space as the man page noted.

Well, I suspect you are overcomplicating this a bit. This should be a
really straightforward thing - well, except for VM_DEAD which is quite
tricky already. We should rather not spread this trickiness outside of
the #PF path. And I would even try hard to start that part simple to see
whether it actually matters. Relying on races between threads without
any locking is quite questionable already. Nobody has pointed to a sane
use case so far.
Yang Shi June 29, 2018, 4:45 p.m. UTC | #23
On 6/29/18 4:34 AM, Michal Hocko wrote:
> On Thu 28-06-18 12:10:10, Yang Shi wrote:
>>
>> On 6/28/18 4:51 AM, Michal Hocko wrote:
>>> On Wed 27-06-18 10:23:39, Yang Shi wrote:
>>>> On 6/27/18 12:24 AM, Michal Hocko wrote:
>>>>> On Tue 26-06-18 18:03:34, Yang Shi wrote:
>>>>>> On 6/26/18 12:43 AM, Peter Zijlstra wrote:
>>>>>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
>>>>>>>> By looking this deeper, we may not be able to cover all the unmapping range
>>>>>>>> for VM_DEAD, for example, if the start addr is in the middle of a vma. We
>>>>>>>> can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
>>>>>>>> mapped area.
>>>>>>>>
>>>>>>>> splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
>>>>>>>> to non-overlapped vmas. Access to overlapped vmas (first and last) will
>>>>>>>> still have undefined behavior.
>>>>>>> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
>>>>>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
>>>>>>> writing, free everything left, drop mmap_sem.
>>>>>>>
>>>>>>> ?
>>>>>>>
>>>>>>> Sure, you acquire the lock 3 times, but both write instances should be
>>>>>>> 'short', and I suppose you can do a demote between 1 and 2 if you care.
>>>>>> Thanks, Peter. Yes, by looking the code and trying two different approaches,
>>>>>> it looks this approach is the most straight-forward one.
>>>>> Yes, you just have to be careful about the max vma count limit.
>>>> Yes, we should just need copy what do_munmap does as below:
>>>>
>>>> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
>>>>               return -ENOMEM;
>>>>
>>>> If the mas map count limit has been reached, it will return failure before
>>>> zapping mappings.
>>> Yeah, but as soon as you drop the lock and retake it, somebody might
>>> have changed the adddress space and we might get inconsistency.
>>>
>>> So I am wondering whether we really need upgrade_read (to promote read
>>> to write lock) and do the
>>> 	down_write
>>> 	split & set up VM_DEAD
>>> 	downgrade_write
>>> 	unmap
>>> 	upgrade_read
>>> 	zap ptes
>>> 	up_write
>> I'm supposed address space changing just can be done by mmap, mremap,
>> mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD flag is
>> set for the vma, just return failure since it is being unmapped.
> I am sorry I do not follow. How does VM_DEAD flag helps for a completely
> unrelated vmas? Or maybe it would be better to post the code to see what
> you mean exactly.

I mean we only care about the vmas which have been found/split by this 
munmap call, right? We have already set VM_DEAD on them. Even if other 
vmas are changed by somebody else, it would not cause any inconsistency 
for this munmap call.
Yang Shi June 29, 2018, 4:50 p.m. UTC | #24
On 6/29/18 4:39 AM, Michal Hocko wrote:
> On Thu 28-06-18 17:59:25, Yang Shi wrote:
>>
>> On 6/28/18 12:10 PM, Yang Shi wrote:
>>>
>>> On 6/28/18 4:51 AM, Michal Hocko wrote:
>>>> On Wed 27-06-18 10:23:39, Yang Shi wrote:
>>>>> On 6/27/18 12:24 AM, Michal Hocko wrote:
>>>>>> On Tue 26-06-18 18:03:34, Yang Shi wrote:
>>>>>>> On 6/26/18 12:43 AM, Peter Zijlstra wrote:
>>>>>>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
>>>>>>>>> By looking this deeper, we may not be able to
>>>>>>>>> cover all the unmapping range
>>>>>>>>> for VM_DEAD, for example, if the start addr is
>>>>>>>>> in the middle of a vma. We
>>>>>>>>> can't set VM_DEAD to that vma since that would
>>>>>>>>> trigger SIGSEGV for still
>>>>>>>>> mapped area.
>>>>>>>>>
>>>>>>>>> splitting can't be done with read mmap_sem held,
>>>>>>>>> so maybe just set VM_DEAD
>>>>>>>>> to non-overlapped vmas. Access to overlapped
>>>>>>>>> vmas (first and last) will
>>>>>>>>> still have undefined behavior.
>>>>>>>> Acquire mmap_sem for writing, split, mark VM_DEAD,
>>>>>>>> drop mmap_sem. Acquire
>>>>>>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
>>>>>>>> writing, free everything left, drop mmap_sem.
>>>>>>>>
>>>>>>>> ?
>>>>>>>>
>>>>>>>> Sure, you acquire the lock 3 times, but both write
>>>>>>>> instances should be
>>>>>>>> 'short', and I suppose you can do a demote between 1
>>>>>>>> and 2 if you care.
>>>>>>> Thanks, Peter. Yes, by looking the code and trying two
>>>>>>> different approaches,
>>>>>>> it looks this approach is the most straight-forward one.
>>>>>> Yes, you just have to be careful about the max vma count limit.
>>>>> Yes, we should just need copy what do_munmap does as below:
>>>>>
>>>>> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
>>>>>               return -ENOMEM;
>>>>>
>>>>> If the mas map count limit has been reached, it will return
>>>>> failure before
>>>>> zapping mappings.
>>>> Yeah, but as soon as you drop the lock and retake it, somebody might
>>>> have changed the adddress space and we might get inconsistency.
>>>>
>>>> So I am wondering whether we really need upgrade_read (to promote read
>>>> to write lock) and do the
>>>>      down_write
>>>>      split & set up VM_DEAD
>>>>      downgrade_write
>>>>      unmap
>>>>      upgrade_read
>>>>      zap ptes
>>>>      up_write
>> Promoting to write lock may be a trouble. There might be other users in the
>> critical section with read lock, we have to wait them to finish.
> Yes. Is that a problem though?

Not a problem, but I'm just not sure how complicated it would be, 
considering all the lock debugging/lockdep stuff.

And the behavior smells like RCU.

>   
>>> I'm supposed address space changing just can be done by mmap, mremap,
>>> mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD
>>> flag is set for the vma, just return failure since it is being unmapped.
>>>
>>> Does it sounds reasonable?
>> It looks we just need care about MAP_FIXED (mmap) and MREMAP_FIXED (mremap),
>> right?
>>
>> How about letting them return -EBUSY or -EAGAIN to notify the application?
> Well, non of those is documented to return EBUSY and EAGAIN already has
> a meaning for locked memory.
>
>> This changes the behavior a little bit, MAP_FIXED and mremap may fail if
>> they fail the race with munmap (if the mapping is larger than 1GB). I'm not
>> sure if any multi-threaded application uses MAP_FIXED and MREMAP_FIXED very
>> heavily which may run into the race condition. I guess it should be rare to
>> meet all the conditions to trigger the race.
>>
>> The programmer should be very cautious about MAP_FIXED.MREMAP_FIXED since
>> they may corrupt its own address space as the man page noted.
> Well, I suspect you are overcomplicating this a bit. This should be
> really straightforward thing - well except for VM_DEAD which is quite
> tricky already. We should rather not spread this trickyness outside of
> the #PF path. And I would even try hard to start that part simple to see
> whether it actually matters. Relying on races between threads without
> any locking is quite questionable already. Nobody has pointed to a sane
> usecase so far.

I agree to keep it as simple as possible and then see whether it matters 
or not. So, in v3 I will just touch the page fault path.

Patch

diff --git a/mm/mmap.c b/mm/mmap.c
index fc41c05..e84f80c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2686,6 +2686,141 @@  int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __split_vma(mm, vma, addr, new_below);
 }
 
+/* Consider PUD size or 1GB mapping as large mapping */
+#ifdef HPAGE_PUD_SIZE
+#define LARGE_MAP_THRESH	HPAGE_PUD_SIZE
+#else
+#define LARGE_MAP_THRESH	(1 * 1024 * 1024 * 1024)
+#endif
+
+/* Unmap large mapping early with acquiring read mmap_sem */
+static int do_munmap_zap_early(struct mm_struct *mm, unsigned long start,
+			       size_t len, struct list_head *uf)
+{
+	unsigned long end = 0;
+	struct vm_area_struct *vma = NULL, *prev, *last, *tmp;
+	bool success = false;
+	int ret = 0;
+
+	if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE - start)
+		return -EINVAL;
+
+	len = (PAGE_ALIGN(len));
+	if (len == 0)
+		return -EINVAL;
+
+	/* Just deal with uf in regular path */
+	if (unlikely(uf))
+		goto regular_path;
+
+	if (len >= LARGE_MAP_THRESH) {
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, start);
+		if (!vma) {
+			up_read(&mm->mmap_sem);
+			return 0;
+		}
+
+		prev = vma->vm_prev;
+
+		end = start + len;
+		if (vma->vm_start > end) {
+			up_read(&mm->mmap_sem);
+			return 0;
+		}
+
+		if (start > vma->vm_start) {
+			int error;
+
+			if (end < vma->vm_end &&
+			    mm->map_count > sysctl_max_map_count) {
+				up_read(&mm->mmap_sem);
+				return -ENOMEM;
+			}
+
+			error = __split_vma(mm, vma, start, 0);
+			if (error) {
+				up_read(&mm->mmap_sem);
+				return error;
+			}
+			prev = vma;
+		}
+
+		last = find_vma(mm, end);
+		if (last && end > last->vm_start) {
+			int error = __split_vma(mm, last, end, 1);
+
+			if (error) {
+				up_read(&mm->mmap_sem);
+				return error;
+			}
+		}
+		vma = prev ? prev->vm_next : mm->mmap;
+
+		/*
+		 * Unmapping vmas, which has VM_LOCKED|VM_HUGETLB|VM_PFNMAP
+		 * flag set or has uprobes set, need acquire write map_sem,
+		 * so skip them in early zap. Just deal with such mapping in
+		 * regular path.
+		 * Borrow can_madv_dontneed_vma() to check the conditions.
+		 */
+		tmp = vma;
+		while (tmp && tmp->vm_start < end) {
+			if (!can_madv_dontneed_vma(tmp) ||
+			    vma_has_uprobes(tmp, start, end))
+				goto sem_drop;
+			tmp = tmp->vm_next;
+		}
+
+		unmap_region(mm, vma, prev, start, end);
+		/* indicates early zap is success */
+		success = true;
+
+sem_drop:
+		up_read(&mm->mmap_sem);
+	}
+
+regular_path:
+	/* hold write mmap_sem for vma manipulation or regular path */
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+	if (success) {
+		/* vmas have been zapped, here just deal with loose end */
+		detach_vmas_to_be_unmapped(mm, vma, prev, end);
+		arch_unmap(mm, vma, start, end);
+		remove_vma_list(mm, vma);
+	} else {
+		/* vma is VM_LOCKED|VM_HUGETLB|VM_PFNMAP or has uprobe */
+		if (vma) {
+			if (unlikely(uf)) {
+				int ret = userfaultfd_unmap_prep(vma, start,
+							end, uf);
+				if (ret)
+					goto out;
+			}
+			if (mm->locked_vm) {
+				tmp = vma;
+				while (tmp && tmp->vm_start < end) {
+					if (tmp->vm_flags & VM_LOCKED) {
+						mm->locked_vm -= vma_pages(tmp);
+						munlock_vma_pages_all(tmp);
+					}
+					tmp = tmp->vm_next;
+				}
+			}
+			detach_vmas_to_be_unmapped(mm, vma, prev, end);
+			unmap_region(mm, vma, prev, start, end);
+			remove_vma_list(mm, vma);
+		} else
+			/* When mapping size < LARGE_MAP_THRESH */
+			ret = do_munmap(mm, start, len, uf);
+	}
+
+out:
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
 /* Munmap is split into 2 main parts -- this part which finds
  * what needs doing, and the areas themselves, which do the
  * work.  This now handles partial unmappings.
@@ -2792,6 +2927,17 @@  int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
 	return 0;
 }
 
+static int vm_munmap_zap_early(unsigned long start, size_t len)
+{
+	int ret;
+	struct mm_struct *mm = current->mm;
+	LIST_HEAD(uf);
+
+	ret = do_munmap_zap_early(mm, start, len, &uf);
+	userfaultfd_unmap_complete(mm, &uf);
+	return ret;
+}
+
 int vm_munmap(unsigned long start, size_t len)
 {
 	int ret;
@@ -2811,7 +2957,7 @@  int vm_munmap(unsigned long start, size_t len)
 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 {
 	profile_munmap(addr);
-	return vm_munmap(addr, len);
+	return vm_munmap_zap_early(addr, len);
 }