
[v2,7/9] mm: vmalloc: Support multiple nodes in vread_iter

Message ID 20230829081142.3619-8-urezki@gmail.com (mailing list archive)
State New
Series Mitigate a vmap lock contention v2

Commit Message

Uladzislau Rezki Aug. 29, 2023, 8:11 a.m. UTC
Extend vread_iter() to be able to perform a sequential
read of VAs which are spread among multiple nodes. This way
a data read over /dev/kmem correctly reflects the vmalloc
memory layout.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 14 deletions(-)
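
For illustration, here is a minimal userspace sketch of the reading pattern
the patch introduces: find the lowest VA at or above the cursor across all
nodes, take only that node's lock, copy, remember va_end as the next cursor,
drop the lock and repeat. This is not kernel code; node_t, area_t, the
pthread locking and the NR_NODES/PER_NODE sizes are made-up stand-ins.

#include <pthread.h>
#include <stdio.h>

#define NR_NODES  4
#define PER_NODE  3

typedef struct { unsigned long va_start, va_end; } area_t;

typedef struct {
	pthread_mutex_t lock;            /* stands in for the per-node busy lock */
	area_t areas[PER_NODE];          /* kept sorted by va_start */
} node_t;

static node_t nodes[NR_NODES];

/*
 * Find the lowest area with va_end > addr across all nodes and return its
 * node with the lock held (NULL if none) -- the same contract as
 * find_vmap_area_exceed_addr_lock() in the patch.
 */
static node_t *find_exceed_addr_lock(unsigned long addr, area_t **out)
{
	node_t *best = NULL;
	int i, j;

	for (i = 0; i < NR_NODES; i++) {
		node_t *n = &nodes[i];

		pthread_mutex_lock(&n->lock);
		for (j = 0; j < PER_NODE; j++) {
			area_t *a = &n->areas[j];

			if (a->va_end <= addr)
				continue;
			if (!best || a->va_start < (*out)->va_start) {
				if (best)
					pthread_mutex_unlock(&best->lock);
				*out = a;
				best = n;
			}
			break;	/* sorted: first area with va_end > addr is this node's lowest */
		}
		if (best != n)
			pthread_mutex_unlock(&n->lock);
	}
	return best;
}

int main(void)
{
	unsigned long base = 0x1000, next = 0;
	area_t *va;
	node_t *vn;
	int i, j;

	for (i = 0; i < NR_NODES; i++)
		pthread_mutex_init(&nodes[i].lock, NULL);

	/* interleave areas across nodes so a sequential walk hops between them */
	for (j = 0; j < PER_NODE; j++)
		for (i = 0; i < NR_NODES; i++) {
			nodes[i].areas[j].va_start = base;
			nodes[i].areas[j].va_end = base + 0x1000;
			base += 0x2000;
		}

	/* the vread_iter()-style walk: only one node lock is held at a time */
	while ((vn = find_exceed_addr_lock(next, &va))) {
		printf("read [%#lx - %#lx)\n", va->va_start, va->va_end);
		next = va->va_end;
		pthread_mutex_unlock(&vn->lock);
	}
	return 0;
}

The walk hops between nodes because areas that belong to different nodes
interleave in the address space, which is why fully reading one tree before
moving to the next could not provide a sequential read.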

Comments

Baoquan He Sept. 11, 2023, 3:58 a.m. UTC | #1
On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> Extend the vread_iter() to be able to perform a sequential
> reading of VAs which are spread among multiple nodes. So a
> data read over the /dev/kmem correctly reflects a vmalloc
> memory layout.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 53 insertions(+), 14 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 4fd4915c532d..968144c16237 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
......  
> @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
>  
>  	remains = count;
>  
> -	/* Hooked to node_0 so far. */
> -	vn = addr_to_node(0);
> -	spin_lock(&vn->busy.lock);

This could change the vread behaviour a little bit. Before, once we took
vmap_area_lock, vread would read out a snapshot of the content at that
moment. Now, reading from one node's tree won't disrupt accesses to other
nodes' trees. Not sure if this matters when people need to access
/proc/kcore, e.g. for dynamic debugging.

And the reading will be a little slower because finding each va needs to
iterate over all vmap_nodes[].
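
For a rough sense of that extra cost, a small hedged back-of-the-envelope
(the node and VA counts below are assumptions, not measurements; before the
change one lookup plus a list walk ran under a single lock, after it every
VA needs a scan of all nodes):

#include <math.h>
#include <stdio.h>

int main(void)
{
	int nr_nodes = 128;        /* assumed number of vmap nodes */
	long nr_vas = 1L << 20;    /* assumed total number of busy VAs */

	/* before: one rb-tree descent, then a list walk under a single lock */
	printf("old: 1 lookup of ~%.0f steps, then O(1) per va, 1 lock\n",
	       log2((double)nr_vas));

	/* after: every va requires scanning all nodes, locking each once */
	printf("new: per va ~%.0f tree steps and %d lock/unlock pairs\n",
	       nr_nodes * log2((double)nr_vas / nr_nodes), nr_nodes);
	return 0;
}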

Otherwise, the patch itself looks good to me.

Reviewed-by: Baoquan He <bhe@redhat.com>

> -
> -	va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root);
> -	if (!va)
> +	vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
> +	if (!vn)
>  		goto finished_zero;
>  
>  	/* no intersects with alive vmap_area */
>  	if ((unsigned long)addr + remains <= va->va_start)
>  		goto finished_zero;
>  
> -	list_for_each_entry_from(va, &vn->busy.head, list) {
> +	do {
>  		size_t copied;
>  
>  		if (remains == 0)
> @@ -4084,10 +4116,10 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
>  		WARN_ON(flags == VMAP_BLOCK);
>  
>  		if (!vm && !flags)
> -			continue;
> +			goto next_va;
>  
>  		if (vm && (vm->flags & VM_UNINITIALIZED))
> -			continue;
> +			goto next_va;
>  
>  		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
>  		smp_rmb();
> @@ -4096,7 +4128,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
>  		size = vm ? get_vm_area_size(vm) : va_size(va);
>  
>  		if (addr >= vaddr + size)
> -			continue;
> +			goto next_va;
>  
>  		if (addr < vaddr) {
>  			size_t to_zero = min_t(size_t, vaddr - addr, remains);
> @@ -4125,15 +4157,22 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
>  
>  		if (copied != n)
>  			goto finished;
> -	}
> +
> +	next_va:
> +		next = va->va_end;
> +		spin_unlock(&vn->busy.lock);
> +	} while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
>  
>  finished_zero:
> -	spin_unlock(&vn->busy.lock);
> +	if (vn)
> +		spin_unlock(&vn->busy.lock);
> +
>  	/* zero-fill memory holes */
>  	return count - remains + zero_iter(iter, remains);
>  finished:
>  	/* Nothing remains, or We couldn't copy/zero everything. */
> -	spin_unlock(&vn->busy.lock);
> +	if (vn)
> +		spin_unlock(&vn->busy.lock);
>  
>  	return count - remains;
>  }
> -- 
> 2.30.2
>
Uladzislau Rezki Sept. 11, 2023, 6:16 p.m. UTC | #2
On Mon, Sep 11, 2023 at 11:58:13AM +0800, Baoquan He wrote:
> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > Extend the vread_iter() to be able to perform a sequential
> > reading of VAs which are spread among multiple nodes. So a
> > data read over the /dev/kmem correctly reflects a vmalloc
> > memory layout.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
> >  1 file changed, 53 insertions(+), 14 deletions(-)
> > 
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 4fd4915c532d..968144c16237 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> ......  
> > @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> >  
> >  	remains = count;
> >  
> > -	/* Hooked to node_0 so far. */
> > -	vn = addr_to_node(0);
> > -	spin_lock(&vn->busy.lock);
> 
> This could change the vread behaviour a little bit. Before, once we take
> vmap_area_lock, the vread will read out the content of snapshot at the
> moment. Now, reading out in one node's tree won't disrupt other nodes'
> tree accessing. Not sure if this matters when people need access
> /proc/kcore, e.g dynamic debugging.
>
With one big tree you anyway drop the lock after one cycle of reading.
As far as I can see, kcore.c's read granularity is PAGE_SIZE.

Please correct me if I am wrong.

> 
> And, the reading will be a little slower because each va finding need
> iterate all vmap_nodes[].
> 
Right. It is a bit tough here, because we have multiple nodes which
represent zones (address spaces), i.e. there is an offset between them.
That means reading one tree fully will not provide a sequential
reading.

>
> Otherwise, the patch itself looks good to me.
> 
> Reviewed-by: Baoquan He <bhe@redhat.com>
>
Applied.

Thank you for looking at it!

--
Uladzislau Rezki
Baoquan He Sept. 12, 2023, 1:42 p.m. UTC | #3
On 09/11/23 at 08:16pm, Uladzislau Rezki wrote:
> On Mon, Sep 11, 2023 at 11:58:13AM +0800, Baoquan He wrote:
> > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > Extend the vread_iter() to be able to perform a sequential
> > > reading of VAs which are spread among multiple nodes. So a
> > > data read over the /dev/kmem correctly reflects a vmalloc
> > > memory layout.
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > ---
> > >  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
> > >  1 file changed, 53 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index 4fd4915c532d..968144c16237 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > ......  
> > > @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> > >  
> > >  	remains = count;
> > >  
> > > -	/* Hooked to node_0 so far. */
> > > -	vn = addr_to_node(0);
> > > -	spin_lock(&vn->busy.lock);
> > 
> > This could change the vread behaviour a little bit. Before, once we take
> > vmap_area_lock, the vread will read out the content of snapshot at the
> > moment. Now, reading out in one node's tree won't disrupt other nodes'
> > tree accessing. Not sure if this matters when people need access
> > /proc/kcore, e.g dynamic debugging.
> >
> With one big tree you anyway drop the lock after one cycle of reading.
> As far as i see, kcore.c's read granularity is a PAGE_SIZE.

To my understanding, kcore reading of vmalloc memory does go page by page,
and it continues after one page if the required size is bigger than one
page. Please see the aligned_vread_iter() code. Before this patch,
vmap_area_lock is held during that complete process.
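
A minimal userspace sketch of that page-by-page chunking (illustrative
only: vread_chunked() is a made-up helper, not the kernel's
aligned_vread_iter(); only the lock-scope comment reflects the point
being made):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL

/* copy [addr, addr + count) one page-bounded chunk at a time */
static size_t vread_chunked(char *dst, const char *addr, size_t count)
{
	size_t done = 0;

	while (done < count) {
		size_t n = PAGE_SIZE -
			((unsigned long)(addr + done) & (PAGE_SIZE - 1));

		if (n > count - done)
			n = count - done;
		/* before the patch: vmap_area_lock stayed held across
		 * every chunk of every va touched by one vread_iter() call */
		memcpy(dst + done, addr + done, n);
		done += n;
	}
	return done;
}

int main(void)
{
	static char src[3 * 4096 + 123] = "payload";
	static char dst[sizeof(src)];

	printf("copied %zu bytes in page-sized chunks\n",
	       vread_chunked(dst, src, sizeof(src)));
	return 0;
}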

> 
> > 
> > And, the reading will be a little slower because each va finding need
> > iterate all vmap_nodes[].
> > 
> Right. It is a bit tough here, because we have multiple nodes which
> represent zones(address space), i.e. there is an offset between them,
> it means that, reading fully one tree, will not provide a sequential
> reading.

Understood. I suppose the kcore reading of vmalloc is not critical. If I
get a chance to test on a machine with 256 CPUs, I will report here.
Baoquan He Sept. 13, 2023, 10:59 a.m. UTC | #4
On 09/11/23 at 08:16pm, Uladzislau Rezki wrote:
> On Mon, Sep 11, 2023 at 11:58:13AM +0800, Baoquan He wrote:
> > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > Extend the vread_iter() to be able to perform a sequential
> > > reading of VAs which are spread among multiple nodes. So a
> > > data read over the /dev/kmem correctly reflects a vmalloc
> > > memory layout.
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > ---
> > >  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
> > >  1 file changed, 53 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index 4fd4915c532d..968144c16237 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > ......  
> > > @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> > >  
> > >  	remains = count;
> > >  
> > > -	/* Hooked to node_0 so far. */
> > > -	vn = addr_to_node(0);
> > > -	spin_lock(&vn->busy.lock);
> > 
> > This could change the vread behaviour a little bit. Before, once we take
> > vmap_area_lock, the vread will read out the content of snapshot at the
> > moment. Now, reading out in one node's tree won't disrupt other nodes'
> > tree accessing. Not sure if this matters when people need access
> > /proc/kcore, e.g dynamic debugging.
> >
> With one big tree you anyway drop the lock after one cycle of reading.
> As far as i see, kcore.c's read granularity is a PAGE_SIZE.

You are right, kcore.c's reading granularity is indeed PAGE_SIZE.
I don't know procfs well and still need to study the code. Then it doesn't
matter much for the multiple nodes in vread_iter(). Sorry for the noise.

static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{  
	......
        start = kc_offset_to_vaddr(*fpos - data_offset);
        if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
                tsz = buflen;

	m = NULL;
        while (buflen) {
	}
	...
}
Uladzislau Rezki Sept. 13, 2023, 3:38 p.m. UTC | #5
On Wed, Sep 13, 2023 at 06:59:42PM +0800, Baoquan He wrote:
> On 09/11/23 at 08:16pm, Uladzislau Rezki wrote:
> > On Mon, Sep 11, 2023 at 11:58:13AM +0800, Baoquan He wrote:
> > > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > > Extend the vread_iter() to be able to perform a sequential
> > > > reading of VAs which are spread among multiple nodes. So a
> > > > data read over the /dev/kmem correctly reflects a vmalloc
> > > > memory layout.
> > > > 
> > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > ---
> > > >  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
> > > >  1 file changed, 53 insertions(+), 14 deletions(-)
> > > > 
> > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > index 4fd4915c532d..968144c16237 100644
> > > > --- a/mm/vmalloc.c
> > > > +++ b/mm/vmalloc.c
> > > ......  
> > > > @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> > > >  
> > > >  	remains = count;
> > > >  
> > > > -	/* Hooked to node_0 so far. */
> > > > -	vn = addr_to_node(0);
> > > > -	spin_lock(&vn->busy.lock);
> > > 
> > > This could change the vread behaviour a little bit. Before, once we take
> > > vmap_area_lock, the vread will read out the content of snapshot at the
> > > moment. Now, reading out in one node's tree won't disrupt other nodes'
> > > tree accessing. Not sure if this matters when people need access
> > > /proc/kcore, e.g dynamic debugging.
> > >
> > With one big tree you anyway drop the lock after one cycle of reading.
> > As far as i see, kcore.c's read granularity is a PAGE_SIZE.
> 
> You are right, kcore.c's reading granularity is truly PAGE_SIZE.
> I don't know procfs well, still need to study the code. Then it doesn't
> matter much with the multiple nodes in vread_iter(). Sorry for the noise.
> 
> static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> {  
> 	......
>         start = kc_offset_to_vaddr(*fpos - data_offset);
>         if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
>                 tsz = buflen;
> 
> 	m = NULL;
>         while (buflen) {
> 	}
> 	...
> }
> 
Good. Then we are on the same page :)

Thank you!

--
Uladzislau Rezki
Uladzislau Rezki Sept. 13, 2023, 3:42 p.m. UTC | #6
On Tue, Sep 12, 2023 at 09:42:32PM +0800, Baoquan He wrote:
> On 09/11/23 at 08:16pm, Uladzislau Rezki wrote:
> > On Mon, Sep 11, 2023 at 11:58:13AM +0800, Baoquan He wrote:
> > > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > > Extend the vread_iter() to be able to perform a sequential
> > > > reading of VAs which are spread among multiple nodes. So a
> > > > data read over the /dev/kmem correctly reflects a vmalloc
> > > > memory layout.
> > > > 
> > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > ---
> > > >  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
> > > >  1 file changed, 53 insertions(+), 14 deletions(-)
> > > > 
> > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > index 4fd4915c532d..968144c16237 100644
> > > > --- a/mm/vmalloc.c
> > > > +++ b/mm/vmalloc.c
> > > ......  
> > > > @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> > > >  
> > > >  	remains = count;
> > > >  
> > > > -	/* Hooked to node_0 so far. */
> > > > -	vn = addr_to_node(0);
> > > > -	spin_lock(&vn->busy.lock);
> > > 
> > > This could change the vread behaviour a little bit. Before, once we take
> > > vmap_area_lock, the vread will read out the content of snapshot at the
> > > moment. Now, reading out in one node's tree won't disrupt other nodes'
> > > tree accessing. Not sure if this matters when people need access
> > > /proc/kcore, e.g dynamic debugging.
> > >
> > With one big tree you anyway drop the lock after one cycle of reading.
> > As far as i see, kcore.c's read granularity is a PAGE_SIZE.
> 
> With my understanding, kcore reading on vmalloc does read page by page,
> it will continue after one page reading if the required size is bigger
> than one page. Please see aligned_vread_iter() code. During the complete
> process, vmap_area_lock is held before this patch.
> 
> > 
> > > 
> > > And, the reading will be a little slower because each va finding need
> > > iterate all vmap_nodes[].
> > > 
> > Right. It is a bit tough here, because we have multiple nodes which
> > represent zones(address space), i.e. there is an offset between them,
> > it means that, reading fully one tree, will not provide a sequential
> > reading.
> 
> Understood. Suppose the kcore reading on vmalloc is not critical. If I
> get chance to test on a machine with 256 cpu, I will report here.
> 
It would be great! Unfortunately I do not have access to such big
systems. What I have is a 64-CPU system at most. If you can, by chance,
test on bigger systems or provide temporary ssh access, that would be
awesome.

--
Uladzislau Rezki
Baoquan He Sept. 14, 2023, 3:02 a.m. UTC | #7
On 09/13/23 at 05:42pm, Uladzislau Rezki wrote:
> On Tue, Sep 12, 2023 at 09:42:32PM +0800, Baoquan He wrote:
> > On 09/11/23 at 08:16pm, Uladzislau Rezki wrote:
> > > On Mon, Sep 11, 2023 at 11:58:13AM +0800, Baoquan He wrote:
> > > > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > > > Extend the vread_iter() to be able to perform a sequential
> > > > > reading of VAs which are spread among multiple nodes. So a
> > > > > data read over the /dev/kmem correctly reflects a vmalloc
> > > > > memory layout.
> > > > > 
> > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > > ---
> > > > >  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
> > > > >  1 file changed, 53 insertions(+), 14 deletions(-)
> > > > > 
> > > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > > index 4fd4915c532d..968144c16237 100644
> > > > > --- a/mm/vmalloc.c
> > > > > +++ b/mm/vmalloc.c
> > > > ......  
> > > > > @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> > > > >  
> > > > >  	remains = count;
> > > > >  
> > > > > -	/* Hooked to node_0 so far. */
> > > > > -	vn = addr_to_node(0);
> > > > > -	spin_lock(&vn->busy.lock);
> > > > 
> > > > This could change the vread behaviour a little bit. Before, once we take
> > > > vmap_area_lock, the vread will read out the content of snapshot at the
> > > > moment. Now, reading out in one node's tree won't disrupt other nodes'
> > > > tree accessing. Not sure if this matters when people need access
> > > > /proc/kcore, e.g dynamic debugging.
> > > >
> > > With one big tree you anyway drop the lock after one cycle of reading.
> > > As far as i see, kcore.c's read granularity is a PAGE_SIZE.
> > 
> > With my understanding, kcore reading on vmalloc does read page by page,
> > it will continue after one page reading if the required size is bigger
> > than one page. Please see aligned_vread_iter() code. During the complete
> > process, vmap_area_lock is held before this patch.
> > 
> > > 
> > > > 
> > > > And, the reading will be a little slower because each va finding need
> > > > iterate all vmap_nodes[].
> > > > 
> > > Right. It is a bit tough here, because we have multiple nodes which
> > > represent zones(address space), i.e. there is an offset between them,
> > > it means that, reading fully one tree, will not provide a sequential
> > > reading.
> > 
> > Understood. Suppose the kcore reading on vmalloc is not critical. If I
> > get chance to test on a machine with 256 cpu, I will report here.
> > 
> It would be great! Unfortunately i do not have an access to such big
> systems. What i have is 64 CPUs max system. If you, by chance can test
> on bigger systems or can provide a temporary ssh access that would be
> awesome.

I got one with 288 CPUs and have sent you the IP address in a private mail.
Baoquan He Sept. 14, 2023, 3:36 a.m. UTC | #8
On 09/13/23 at 05:42pm, Uladzislau Rezki wrote:
> On Tue, Sep 12, 2023 at 09:42:32PM +0800, Baoquan He wrote:
> > On 09/11/23 at 08:16pm, Uladzislau Rezki wrote:
> > > On Mon, Sep 11, 2023 at 11:58:13AM +0800, Baoquan He wrote:
> > > > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > > > Extend the vread_iter() to be able to perform a sequential
> > > > > reading of VAs which are spread among multiple nodes. So a
> > > > > data read over the /dev/kmem correctly reflects a vmalloc
> > > > > memory layout.
> > > > > 
> > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > > ---
> > > > >  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
> > > > >  1 file changed, 53 insertions(+), 14 deletions(-)
> > > > > 
> > > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > > index 4fd4915c532d..968144c16237 100644
> > > > > --- a/mm/vmalloc.c
> > > > > +++ b/mm/vmalloc.c
> > > > ......  
> > > > > @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> > > > >  
> > > > >  	remains = count;
> > > > >  
> > > > > -	/* Hooked to node_0 so far. */
> > > > > -	vn = addr_to_node(0);
> > > > > -	spin_lock(&vn->busy.lock);
> > > > 
> > > > This could change the vread behaviour a little bit. Before, once we take
> > > > vmap_area_lock, the vread will read out the content of snapshot at the
> > > > moment. Now, reading out in one node's tree won't disrupt other nodes'
> > > > tree accessing. Not sure if this matters when people need access
> > > > /proc/kcore, e.g dynamic debugging.
> > > >
> > > With one big tree you anyway drop the lock after one cycle of reading.
> > > As far as i see, kcore.c's read granularity is a PAGE_SIZE.
> > 
> > With my understanding, kcore reading on vmalloc does read page by page,
> > it will continue after one page reading if the required size is bigger
> > than one page. Please see aligned_vread_iter() code. During the complete
> > process, vmap_area_lock is held before this patch.
> > 
> > > 
> > > > 
> > > > And, the reading will be a little slower because each va finding need
> > > > iterate all vmap_nodes[].
> > > > 
> > > Right. It is a bit tough here, because we have multiple nodes which
> > > represent zones(address space), i.e. there is an offset between them,
> > > it means that, reading fully one tree, will not provide a sequential
> > > reading.
> > 
> > Understood. Suppose the kcore reading on vmalloc is not critical. If I
> > get chance to test on a machine with 256 cpu, I will report here.
> > 
> It would be great! Unfortunately i do not have an access to such big
> systems. What i have is 64 CPUs max system. If you, by chance can test
> on bigger systems or can provide a temporary ssh access that would be
> awesome.

10.16.216.205
user:root
password:redhat

This is a testing server in our lab; we apply for usage each time and it
will reinstall the OS, so the root user should be OK. I will take it for two days.

If access doesn't work, I can do some testing myself if you want me to
run some commands.
Baoquan He Sept. 14, 2023, 3:38 a.m. UTC | #9
On 09/14/23 at 11:36am, Baoquan He wrote:
> On 09/13/23 at 05:42pm, Uladzislau Rezki wrote:
> > On Tue, Sep 12, 2023 at 09:42:32PM +0800, Baoquan He wrote:
> > > On 09/11/23 at 08:16pm, Uladzislau Rezki wrote:
> > > > On Mon, Sep 11, 2023 at 11:58:13AM +0800, Baoquan He wrote:
> > > > > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > > > > Extend the vread_iter() to be able to perform a sequential
> > > > > > reading of VAs which are spread among multiple nodes. So a
> > > > > > data read over the /dev/kmem correctly reflects a vmalloc
> > > > > > memory layout.
> > > > > > 
> > > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > > > ---
> > > > > >  mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
> > > > > >  1 file changed, 53 insertions(+), 14 deletions(-)
> > > > > > 
> > > > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > > > index 4fd4915c532d..968144c16237 100644
> > > > > > --- a/mm/vmalloc.c
> > > > > > +++ b/mm/vmalloc.c
> > > > > ......  
> > > > > > @@ -4057,19 +4093,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> > > > > >  
> > > > > >  	remains = count;
> > > > > >  
> > > > > > -	/* Hooked to node_0 so far. */
> > > > > > -	vn = addr_to_node(0);
> > > > > > -	spin_lock(&vn->busy.lock);
> > > > > 
> > > > > This could change the vread behaviour a little bit. Before, once we take
> > > > > vmap_area_lock, the vread will read out the content of snapshot at the
> > > > > moment. Now, reading out in one node's tree won't disrupt other nodes'
> > > > > tree accessing. Not sure if this matters when people need access
> > > > > /proc/kcore, e.g dynamic debugging.
> > > > >
> > > > With one big tree you anyway drop the lock after one cycle of reading.
> > > > As far as i see, kcore.c's read granularity is a PAGE_SIZE.
> > > 
> > > With my understanding, kcore reading on vmalloc does read page by page,
> > > it will continue after one page reading if the required size is bigger
> > > than one page. Please see aligned_vread_iter() code. During the complete
> > > process, vmap_area_lock is held before this patch.
> > > 
> > > > 
> > > > > 
> > > > > And, the reading will be a little slower because each va finding need
> > > > > iterate all vmap_nodes[].
> > > > > 
> > > > Right. It is a bit tough here, because we have multiple nodes which
> > > > represent zones(address space), i.e. there is an offset between them,
> > > > it means that, reading fully one tree, will not provide a sequential
> > > > reading.
> > > 
> > > Understood. Suppose the kcore reading on vmalloc is not critical. If I
> > > get chance to test on a machine with 256 cpu, I will report here.
> > > 
> > It would be great! Unfortunately i do not have an access to such big
> > systems. What i have is 64 CPUs max system. If you, by chance can test
> > on bigger systems or can provide a temporary ssh access that would be
> > awesome.
> 
> 10.16.216.205
> user:root
> password:redhat
> 
> This is a testing server in our lab, we apply for usage each time and it
> will reinstall OS, root user should be OK. I will take it for two days.

Oops, I sent it out publicly.

> 
> If accessing is not available, I can do some testing if you want me to
> run some commands.
>

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4fd4915c532d..968144c16237 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -870,7 +870,7 @@  unsigned long vmalloc_nr_pages(void)
 
 /* Look up the first VA which satisfies addr < va_end, NULL if none. */
 static struct vmap_area *
-find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
+__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
 {
 	struct vmap_area *va = NULL;
 	struct rb_node *n = root->rb_node;
@@ -894,6 +894,41 @@  find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
 	return va;
 }
 
+/*
+ * Returns a node where a first VA, that satisfies addr < va_end, resides.
+ * If success, a node is locked. A user is responsible to unlock it when a
+ * VA is no longer needed to be accessed.
+ *
+ * Returns NULL if nothing found.
+ */
+static struct vmap_node *
+find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
+{
+	struct vmap_node *vn, *va_node = NULL;
+	struct vmap_area *va_lowest;
+	int i;
+
+	for (i = 0; i < nr_nodes; i++) {
+		vn = &nodes[i];
+
+		spin_lock(&vn->busy.lock);
+		va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
+		if (va_lowest) {
+			if (!va_node || va_lowest->va_start < (*va)->va_start) {
+				if (va_node)
+					spin_unlock(&va_node->busy.lock);
+
+				*va = va_lowest;
+				va_node = vn;
+				continue;
+			}
+		}
+		spin_unlock(&vn->busy.lock);
+	}
+
+	return va_node;
+}
+
 static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
 {
 	struct rb_node *n = root->rb_node;
@@ -4048,6 +4083,7 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 	struct vm_struct *vm;
 	char *vaddr;
 	size_t n, size, flags, remains;
+	unsigned long next;
 
 	addr = kasan_reset_tag(addr);
 
@@ -4057,19 +4093,15 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 	remains = count;
 
-	/* Hooked to node_0 so far. */
-	vn = addr_to_node(0);
-	spin_lock(&vn->busy.lock);
-
-	va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root);
-	if (!va)
+	vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
+	if (!vn)
 		goto finished_zero;
 
 	/* no intersects with alive vmap_area */
 	if ((unsigned long)addr + remains <= va->va_start)
 		goto finished_zero;
 
-	list_for_each_entry_from(va, &vn->busy.head, list) {
+	do {
 		size_t copied;
 
 		if (remains == 0)
@@ -4084,10 +4116,10 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 		WARN_ON(flags == VMAP_BLOCK);
 
 		if (!vm && !flags)
-			continue;
+			goto next_va;
 
 		if (vm && (vm->flags & VM_UNINITIALIZED))
-			continue;
+			goto next_va;
 
 		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
 		smp_rmb();
@@ -4096,7 +4128,7 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 		size = vm ? get_vm_area_size(vm) : va_size(va);
 
 		if (addr >= vaddr + size)
-			continue;
+			goto next_va;
 
 		if (addr < vaddr) {
 			size_t to_zero = min_t(size_t, vaddr - addr, remains);
@@ -4125,15 +4157,22 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 		if (copied != n)
 			goto finished;
-	}
+
+	next_va:
+		next = va->va_end;
+		spin_unlock(&vn->busy.lock);
+	} while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
 
 finished_zero:
-	spin_unlock(&vn->busy.lock);
+	if (vn)
+		spin_unlock(&vn->busy.lock);
+
 	/* zero-fill memory holes */
 	return count - remains + zero_iter(iter, remains);
 finished:
 	/* Nothing remains, or We couldn't copy/zero everything. */
-	spin_unlock(&vn->busy.lock);
+	if (vn)
+		spin_unlock(&vn->busy.lock);
 
 	return count - remains;
 }