diff mbox series

[v3,1/2] mm, sparse: drop pgdat_resize_lock in sparse_add/remove_one_section()

Message ID 20181129155316.8174-1-richard.weiyang@gmail.com (mailing list archive)
State New, archived
Headers show
Series [v3,1/2] mm, sparse: drop pgdat_resize_lock in sparse_add/remove_one_section() | expand

Commit Message

Wei Yang Nov. 29, 2018, 3:53 p.m. UTC
pgdat_resize_lock is used to protect pgdat's memory region information
like: node_start_pfn, node_present_pages, etc. While in function
sparse_add/remove_one_section(), pgdat_resize_lock is used to protect
initialization/release of one mem_section. This does not look proper.

Based on current implementation, even remove this lock, mem_section
is still away from contention, because it is protected by global
mem_hotplug_lock.

Following is the current call trace of sparse_add/remove_one_section()

    mem_hotplug_begin()
    arch_add_memory()
       add_pages()
           __add_pages()
               __add_section()
                   sparse_add_one_section()
    mem_hotplug_done()

    mem_hotplug_begin()
    arch_remove_memory()
        __remove_pages()
            __remove_section()
                sparse_remove_one_section()
    mem_hotplug_done()

The comment above the pgdat_resize_lock also mentions "Holding this will
also guarantee that any pfn_valid() stays that way.", but the current
implementation does not actually provide this guarantee: there aren't any
pfn walkers that take the lock, so it looks like a relic from the past.
This patch therefore also removes that comment.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>

---
v3:
   * adjust the changelog with the reason for this change
   * remove a comment for pgdat_resize_lock
   * separate the prototype change of sparse_add_one_section() to
     another one
v2:
   * adjust changelog to show this procedure is serialized by global
     mem_hotplug_lock
---
 include/linux/mmzone.h | 2 --
 mm/sparse.c            | 9 +--------
 2 files changed, 1 insertion(+), 10 deletions(-)

Comments

David Hildenbrand Nov. 29, 2018, 4:06 p.m. UTC | #1
On 29.11.18 16:53, Wei Yang wrote:
> pgdat_resize_lock is used to protect pgdat's memory region information
> like: node_start_pfn, node_present_pages, etc. While in function
> sparse_add/remove_one_section(), pgdat_resize_lock is used to protect
> initialization/release of one mem_section. This looks not proper.
> 
> Based on current implementation, even remove this lock, mem_section
> is still away from contention, because it is protected by global
> mem_hotpulg_lock.

s/mem_hotpulg_lock/mem_hotplug_lock/

> 
> Following is the current call trace of sparse_add/remove_one_section()
> 
>     mem_hotplug_begin()
>     arch_add_memory()
>        add_pages()
>            __add_pages()
>                __add_section()
>                    sparse_add_one_section()
>     mem_hotplug_done()
> 
>     mem_hotplug_begin()
>     arch_remove_memory()
>         __remove_pages()
>             __remove_section()
>                 sparse_remove_one_section()
>     mem_hotplug_done()
> 
> The comment above the pgdat_resize_lock also mentions "Holding this will
> also guarantee that any pfn_valid() stays that way.", which is true with
> the current implementation and false after this patch. But current
> implementation doesn't meet this comment. There isn't any pfn walkers
> to take the lock so this looks like a relict from the past. This patch
> also removes this comment.

Should we start to document which lock is expected to protect what?

I suggest adding what you just found out to
Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
Maybe a new subsection for mem_hotplug_lock. And eventually also
pgdat_resize_lock.

> 
> Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
> 
> ---
> v3:
>    * adjust the changelog with the reason for this change
>    * remove a comment for pgdat_resize_lock
>    * separate the prototype change of sparse_add_one_section() to
>      another one
> v2:
>    * adjust changelog to show this procedure is serialized by global
>      mem_hotplug_lock
> ---
>  include/linux/mmzone.h | 2 --
>  mm/sparse.c            | 9 +--------
>  2 files changed, 1 insertion(+), 10 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 1bb749bee284..0a66085d7ced 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -638,8 +638,6 @@ typedef struct pglist_data {
>  	/*
>  	 * Must be held any time you expect node_start_pfn,
>  	 * node_present_pages, node_spanned_pages or nr_zones stay constant.
> -	 * Holding this will also guarantee that any pfn_valid() stays that
> -	 * way.
>  	 *
>  	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
>  	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 33307fc05c4d..5825f276485f 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -669,7 +669,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
>  	struct mem_section *ms;
>  	struct page *memmap;
>  	unsigned long *usemap;
> -	unsigned long flags;
>  	int ret;
>  
>  	/*
> @@ -689,8 +688,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
>  		return -ENOMEM;
>  	}
>  
> -	pgdat_resize_lock(pgdat, &flags);
> -
>  	ms = __pfn_to_section(start_pfn);
>  	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
>  		ret = -EEXIST;
> @@ -707,7 +704,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
>  	sparse_init_one_section(ms, section_nr, memmap, usemap);
>  
>  out:
> -	pgdat_resize_unlock(pgdat, &flags);
>  	if (ret < 0) {
>  		kfree(usemap);
>  		__kfree_section_memmap(memmap, altmap);
> @@ -769,10 +765,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
>  		unsigned long map_offset, struct vmem_altmap *altmap)
>  {
>  	struct page *memmap = NULL;
> -	unsigned long *usemap = NULL, flags;
> -	struct pglist_data *pgdat = zone->zone_pgdat;
> +	unsigned long *usemap = NULL;
>  
> -	pgdat_resize_lock(pgdat, &flags);
>  	if (ms->section_mem_map) {
>  		usemap = ms->pageblock_flags;
>  		memmap = sparse_decode_mem_map(ms->section_mem_map,
> @@ -780,7 +774,6 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
>  		ms->section_mem_map = 0;
>  		ms->pageblock_flags = NULL;
>  	}
> -	pgdat_resize_unlock(pgdat, &flags);
>  
>  	clear_hwpoisoned_pages(memmap + map_offset,
>  			PAGES_PER_SECTION - map_offset);
>
Michal Hocko Nov. 29, 2018, 5:14 p.m. UTC | #2
On Thu 29-11-18 23:53:15, Wei Yang wrote:
> pgdat_resize_lock is used to protect pgdat's memory region information
> like: node_start_pfn, node_present_pages, etc. While in function
> sparse_add/remove_one_section(), pgdat_resize_lock is used to protect
> initialization/release of one mem_section. This looks not proper.
> 
> Based on current implementation, even remove this lock, mem_section
> is still away from contention, because it is protected by global
> mem_hotpulg_lock.

I guess you wanted to say:
These code paths are currently protected by mem_hotplug_lock, but should
there ever be any reason for locking at the sparse layer, a dedicated
lock should be introduced.

> 
> Following is the current call trace of sparse_add/remove_one_section()
> 
>     mem_hotplug_begin()
>     arch_add_memory()
>        add_pages()
>            __add_pages()
>                __add_section()
>                    sparse_add_one_section()
>     mem_hotplug_done()
> 
>     mem_hotplug_begin()
>     arch_remove_memory()
>         __remove_pages()
>             __remove_section()
>                 sparse_remove_one_section()
>     mem_hotplug_done()
> 
> The comment above the pgdat_resize_lock also mentions "Holding this will
> also guarantee that any pfn_valid() stays that way.", which is true with
> the current implementation and false after this patch. But current
> implementation doesn't meet this comment. There isn't any pfn walkers
> to take the lock so this looks like a relict from the past. This patch
> also removes this comment.
> 
> Signed-off-by: Wei Yang <richard.weiyang@gmail.com>

Other than that
Acked-by: Michal Hocko <mhocko@suse.com>

> 
> ---
> v3:
>    * adjust the changelog with the reason for this change
>    * remove a comment for pgdat_resize_lock
>    * separate the prototype change of sparse_add_one_section() to
>      another one
> v2:
>    * adjust changelog to show this procedure is serialized by global
>      mem_hotplug_lock
> ---
>  include/linux/mmzone.h | 2 --
>  mm/sparse.c            | 9 +--------
>  2 files changed, 1 insertion(+), 10 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 1bb749bee284..0a66085d7ced 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -638,8 +638,6 @@ typedef struct pglist_data {
>  	/*
>  	 * Must be held any time you expect node_start_pfn,
>  	 * node_present_pages, node_spanned_pages or nr_zones stay constant.
> -	 * Holding this will also guarantee that any pfn_valid() stays that
> -	 * way.
>  	 *
>  	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
>  	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 33307fc05c4d..5825f276485f 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -669,7 +669,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
>  	struct mem_section *ms;
>  	struct page *memmap;
>  	unsigned long *usemap;
> -	unsigned long flags;
>  	int ret;
>  
>  	/*
> @@ -689,8 +688,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
>  		return -ENOMEM;
>  	}
>  
> -	pgdat_resize_lock(pgdat, &flags);
> -
>  	ms = __pfn_to_section(start_pfn);
>  	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
>  		ret = -EEXIST;
> @@ -707,7 +704,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
>  	sparse_init_one_section(ms, section_nr, memmap, usemap);
>  
>  out:
> -	pgdat_resize_unlock(pgdat, &flags);
>  	if (ret < 0) {
>  		kfree(usemap);
>  		__kfree_section_memmap(memmap, altmap);
> @@ -769,10 +765,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
>  		unsigned long map_offset, struct vmem_altmap *altmap)
>  {
>  	struct page *memmap = NULL;
> -	unsigned long *usemap = NULL, flags;
> -	struct pglist_data *pgdat = zone->zone_pgdat;
> +	unsigned long *usemap = NULL;
>  
> -	pgdat_resize_lock(pgdat, &flags);
>  	if (ms->section_mem_map) {
>  		usemap = ms->pageblock_flags;
>  		memmap = sparse_decode_mem_map(ms->section_mem_map,
> @@ -780,7 +774,6 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
>  		ms->section_mem_map = 0;
>  		ms->pageblock_flags = NULL;
>  	}
> -	pgdat_resize_unlock(pgdat, &flags);
>  
>  	clear_hwpoisoned_pages(memmap + map_offset,
>  			PAGES_PER_SECTION - map_offset);
> -- 
> 2.15.1
>
Michal Hocko Nov. 29, 2018, 5:17 p.m. UTC | #3
On Thu 29-11-18 17:06:15, David Hildenbrand wrote:
> I suggest adding what you just found out to
[...]
> Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
> Maybe a new subsection for mem_hotplug_lock. And eventually also
> pgdat_resize_lock.

That would be really great! I guess I have suggested something like that
to Oscar already and he provided a highlevel overview.
Wei Yang Nov. 30, 2018, 4:28 a.m. UTC | #4
On Thu, Nov 29, 2018 at 05:06:15PM +0100, David Hildenbrand wrote:
>On 29.11.18 16:53, Wei Yang wrote:
>> pgdat_resize_lock is used to protect pgdat's memory region information
>> like: node_start_pfn, node_present_pages, etc. While in function
>> sparse_add/remove_one_section(), pgdat_resize_lock is used to protect
>> initialization/release of one mem_section. This looks not proper.
>> 
>> Based on current implementation, even remove this lock, mem_section
>> is still away from contention, because it is protected by global
>> mem_hotpulg_lock.
>
>s/mem_hotpulg_lock/mem_hotplug_lock/
>
>> 
>> Following is the current call trace of sparse_add/remove_one_section()
>> 
>>     mem_hotplug_begin()
>>     arch_add_memory()
>>        add_pages()
>>            __add_pages()
>>                __add_section()
>>                    sparse_add_one_section()
>>     mem_hotplug_done()
>> 
>>     mem_hotplug_begin()
>>     arch_remove_memory()
>>         __remove_pages()
>>             __remove_section()
>>                 sparse_remove_one_section()
>>     mem_hotplug_done()
>> 
>> The comment above the pgdat_resize_lock also mentions "Holding this will
>> also guarantee that any pfn_valid() stays that way.", which is true with
>> the current implementation and false after this patch. But current
>> implementation doesn't meet this comment. There isn't any pfn walkers
>> to take the lock so this looks like a relict from the past. This patch
>> also removes this comment.
>
>Should we start to document which lock is expected to protect what?
>
>I suggest adding what you just found out to
>Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
>Maybe a new subsection for mem_hotplug_lock. And eventually also
>pgdat_resize_lock.

Well, I am not good at document writting. Below is my first trial.  Look
forward your comments.

BTW, in case I would send a new version with this, would I put this into
a separate one or merge this into current one?

diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 5c4432c96c4b..1548820a0762 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -396,6 +396,20 @@ Need more implementation yet....
 Locking Internals
 =================
 
+There are three locks involved in memory-hotplug, two global locks and one
+local lock:
+
+- device_hotplug_lock
+- mem_hotplug_lock
+- device_lock
+
+Currently, they are twisted together for all kinds of reasons. The following
+part is divded into device_hotplug_lock and mem_hotplug_lock parts
+respectively to describe those tricky situations.
+
+device_hotplug_lock
+---------------------
+
 When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
 the device_hotplug_lock should be held to:
 
@@ -417,14 +431,21 @@ memory faster than expected:
 As the device is visible to user space before taking the device_lock(), this
 can result in a lock inversion.
 
+mem_hotplug_lock
+---------------------
+
 onlining/offlining of memory should be done via device_online()/
-device_offline() - to make sure it is properly synchronized to actions
-via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
+device_offline() - to make sure it is properly synchronized to actions via
+sysfs. Even mem_hotplug_lock is used to protect the process, because of the
+lock inversion described above, holding device_hotplug_lock is still advised
+(to e.g. protect online_type)
 
 When adding/removing/onlining/offlining memory or adding/removing
 heterogeneous/device memory, we should always hold the mem_hotplug_lock in
 write mode to serialise memory hotplug (e.g. access to global/zone
-variables).
+variables). Currently, we take advantage of this to serialise sparsemem's
+mem_section handling in sparse_add_one_section() and
+sparse_remove_one_section().
 
 In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read
 mode allows for a quite efficient get_online_mems/put_online_mems

>
>
>Thanks,
>
>David / dhildenb
David Hildenbrand Nov. 30, 2018, 9:19 a.m. UTC | #5
>> I suggest adding what you just found out to
>> Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
>> Maybe a new subsection for mem_hotplug_lock. And eventually also
>> pgdat_resize_lock.
> 
> Well, I am not good at document writting. Below is my first trial.  Look
> forward your comments.

I'll have a look, maybe also Oscar and Michal can have a look. I guess
we don't have to cover all now, we can add more details as we discover them.

> 
> BTW, in case I would send a new version with this, would I put this into
> a separate one or merge this into current one?

I would put this into a separate patch.

> 
> diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
> index 5c4432c96c4b..1548820a0762 100644
> --- a/Documentation/admin-guide/mm/memory-hotplug.rst
> +++ b/Documentation/admin-guide/mm/memory-hotplug.rst
> @@ -396,6 +396,20 @@ Need more implementation yet....
>  Locking Internals
>  =================
>  
> +There are three locks involved in memory-hotplug, two global lock and one local
> +lock:
> +
> +- device_hotplug_lock
> +- mem_hotplug_lock
> +- device_lock
> +
> +Currently, they are twisted together for all kinds of reasons. The following
> +part is divded into device_hotplug_lock and mem_hotplug_lock parts

s/divded/divided/

> +respectively to describe those tricky situations.
> +
> +device_hotplug_lock
> +---------------------
> +
>  When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
>  the device_hotplug_lock should be held to:
>  
> @@ -417,14 +431,21 @@ memory faster than expected:
>  As the device is visible to user space before taking the device_lock(), this
>  can result in a lock inversion.
>  
> +mem_hotplug_lock
> +---------------------
> +

I would make this section start after the following paragraph, as most of
that paragraph belongs to the device_hotplug_lock.


>  onlining/offlining of memory should be done via device_online()/
> -device_offline() - to make sure it is properly synchronized to actions
> -via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
> +device_offline() - to make sure it is properly synchronized to actions via
> +sysfs. Even mem_hotplug_lock is used to protect the process, because of the
> +lock inversion described above, holding device_hotplug_lock is still advised
> +(to e.g. protect online_type)
>  
>  When adding/removing/onlining/offlining memory or adding/removing
>  heterogeneous/device memory, we should always hold the mem_hotplug_lock in
>  write mode to serialise memory hotplug (e.g. access to global/zone
> -variables).
> +variables). Currently, we take advantage of this to serialise sparsemem's
> +mem_section handling in sparse_add_one_section() and
> +sparse_remove_one_section().
>  
>  In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read
>  mode allows for a quite efficient get_online_mems/put_online_mems
> 
>>
>>
>> Thanks,
>>
>> David / dhildenb
> 

Apart from that looks good to me, thanks!
Michal Hocko Nov. 30, 2018, 9:52 a.m. UTC | #6
On Fri 30-11-18 10:19:13, David Hildenbrand wrote:
> >> I suggest adding what you just found out to
> >> Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
> >> Maybe a new subsection for mem_hotplug_lock. And eventually also
> >> pgdat_resize_lock.
> > 
> > Well, I am not good at document writting. Below is my first trial.  Look
> > forward your comments.
> 
> I'll have a look, maybe also Oscar and Michal can have a look. I guess
> we don't have to cover all now, we can add more details as we discover them.

Oscar, didn't you have something already?
Wei Yang Dec. 1, 2018, 12:31 a.m. UTC | #7
On Fri, Nov 30, 2018 at 10:19:13AM +0100, David Hildenbrand wrote:
>>> I suggest adding what you just found out to
>>> Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
>>> Maybe a new subsection for mem_hotplug_lock. And eventually also
>>> pgdat_resize_lock.
>> 
>> Well, I am not good at document writting. Below is my first trial.  Look
>> forward your comments.
>
>I'll have a look, maybe also Oscar and Michal can have a look. I guess
>we don't have to cover all now, we can add more details as we discover them.
>
>> 
>> BTW, in case I would send a new version with this, would I put this into
>> a separate one or merge this into current one?
>
>I would put this into a separate patch.
>
>> 
>> diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
>> index 5c4432c96c4b..1548820a0762 100644
>> --- a/Documentation/admin-guide/mm/memory-hotplug.rst
>> +++ b/Documentation/admin-guide/mm/memory-hotplug.rst
>> @@ -396,6 +396,20 @@ Need more implementation yet....
>>  Locking Internals
>>  =================
>>  
>> +There are three locks involved in memory-hotplug, two global lock and one local
>> +lock:
>> +
>> +- device_hotplug_lock
>> +- mem_hotplug_lock
>> +- device_lock
>> +
>> +Currently, they are twisted together for all kinds of reasons. The following
>> +part is divded into device_hotplug_lock and mem_hotplug_lock parts
>
>s/divded/divided/
>
>> +respectively to describe those tricky situations.
>> +
>> +device_hotplug_lock
>> +---------------------
>> +
>>  When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
>>  the device_hotplug_lock should be held to:
>>  
>> @@ -417,14 +431,21 @@ memory faster than expected:
>>  As the device is visible to user space before taking the device_lock(), this
>>  can result in a lock inversion.
>>  
>> +mem_hotplug_lock
>> +---------------------
>> +
>
>I would this section start after the following paragraph, as most of
>that paragraph belongs to the device_hotplug_lock.
>
>
>>  onlining/offlining of memory should be done via device_online()/
>> -device_offline() - to make sure it is properly synchronized to actions
>> -via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
>> +device_offline() - to make sure it is properly synchronized to actions via
>> +sysfs. Even mem_hotplug_lock is used to protect the process, because of the
>> +lock inversion described above, holding device_hotplug_lock is still advised
>> +(to e.g. protect online_type)
>>  
>>  When adding/removing/onlining/offlining memory or adding/removing
>>  heterogeneous/device memory, we should always hold the mem_hotplug_lock in
>>  write mode to serialise memory hotplug (e.g. access to global/zone
>> -variables).
>> +variables). Currently, we take advantage of this to serialise sparsemem's
>> +mem_section handling in sparse_add_one_section() and
>> +sparse_remove_one_section().
>>  
>>  In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read
>>  mode allows for a quite efficient get_online_mems/put_online_mems
>> 
>>>
>>>
>>> Thanks,
>>>
>>> David / dhildenb
>> 
>
>Apart from that looks good to me, thanks!
>

Thanks :-)

>
>-- 
>
>Thanks,
>
>David / dhildenb
David Hildenbrand Dec. 3, 2018, 11:25 a.m. UTC | #8
On 30.11.18 05:28, Wei Yang wrote:
> On Thu, Nov 29, 2018 at 05:06:15PM +0100, David Hildenbrand wrote:
>> On 29.11.18 16:53, Wei Yang wrote:
>>> pgdat_resize_lock is used to protect pgdat's memory region information
>>> like: node_start_pfn, node_present_pages, etc. While in function
>>> sparse_add/remove_one_section(), pgdat_resize_lock is used to protect
>>> initialization/release of one mem_section. This looks not proper.
>>>
>>> Based on current implementation, even remove this lock, mem_section
>>> is still away from contention, because it is protected by global
>>> mem_hotpulg_lock.
>>
>> s/mem_hotpulg_lock/mem_hotplug_lock/
>>
>>>
>>> Following is the current call trace of sparse_add/remove_one_section()
>>>
>>>     mem_hotplug_begin()
>>>     arch_add_memory()
>>>        add_pages()
>>>            __add_pages()
>>>                __add_section()
>>>                    sparse_add_one_section()
>>>     mem_hotplug_done()
>>>
>>>     mem_hotplug_begin()
>>>     arch_remove_memory()
>>>         __remove_pages()
>>>             __remove_section()
>>>                 sparse_remove_one_section()
>>>     mem_hotplug_done()
>>>
>>> The comment above the pgdat_resize_lock also mentions "Holding this will
>>> also guarantee that any pfn_valid() stays that way.", which is true with
>>> the current implementation and false after this patch. But current
>>> implementation doesn't meet this comment. There isn't any pfn walkers
>>> to take the lock so this looks like a relict from the past. This patch
>>> also removes this comment.
>>
>> Should we start to document which lock is expected to protect what?
>>
>> I suggest adding what you just found out to
>> Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
>> Maybe a new subsection for mem_hotplug_lock. And eventually also
>> pgdat_resize_lock.
> 
> Well, I am not good at document writting. Below is my first trial.  Look
> forward your comments.
> 
> BTW, in case I would send a new version with this, would I put this into
> a separate one or merge this into current one?
> 
> diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
> index 5c4432c96c4b..1548820a0762 100644
> --- a/Documentation/admin-guide/mm/memory-hotplug.rst
> +++ b/Documentation/admin-guide/mm/memory-hotplug.rst

BTW, it really should go into

Documentation/core-api/memory-hotplug.rst

Something got wrong while merging this in linux-next, so now we have
duplicate documentation and the one in
Documentation/admin-guide/mm/memory-hotplug.rst about locking internals
has to go.


> @@ -396,6 +396,20 @@ Need more implementation yet....
>  Locking Internals
>  =================
>  
> +There are three locks involved in memory-hotplug, two global lock and one local
> +lock:
> +
> +- device_hotplug_lock
> +- mem_hotplug_lock
> +- device_lock
> +
> +Currently, they are twisted together for all kinds of reasons. The following
> +part is divded into device_hotplug_lock and mem_hotplug_lock parts
> +respectively to describe those tricky situations.
> +
> +device_hotplug_lock
> +---------------------
> +
>  When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
>  the device_hotplug_lock should be held to:
>  
> @@ -417,14 +431,21 @@ memory faster than expected:
>  As the device is visible to user space before taking the device_lock(), this
>  can result in a lock inversion.
>  
> +mem_hotplug_lock
> +---------------------
> +
>  onlining/offlining of memory should be done via device_online()/
> -device_offline() - to make sure it is properly synchronized to actions
> -via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
> +device_offline() - to make sure it is properly synchronized to actions via
> +sysfs. Even mem_hotplug_lock is used to protect the process, because of the
> +lock inversion described above, holding device_hotplug_lock is still advised
> +(to e.g. protect online_type)
>  
>  When adding/removing/onlining/offlining memory or adding/removing
>  heterogeneous/device memory, we should always hold the mem_hotplug_lock in
>  write mode to serialise memory hotplug (e.g. access to global/zone
> -variables).
> +variables). Currently, we take advantage of this to serialise sparsemem's
> +mem_section handling in sparse_add_one_section() and
> +sparse_remove_one_section().
>  
>  In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read
>  mode allows for a quite efficient get_online_mems/put_online_mems
> 
>>
>>
>> Thanks,
>>
>> David / dhildenb
>
Wei Yang Dec. 3, 2018, 9:06 p.m. UTC | #9
On Mon, Dec 03, 2018 at 12:25:20PM +0100, David Hildenbrand wrote:
>On 30.11.18 05:28, Wei Yang wrote:
>> On Thu, Nov 29, 2018 at 05:06:15PM +0100, David Hildenbrand wrote:
>>> On 29.11.18 16:53, Wei Yang wrote:
>>>> pgdat_resize_lock is used to protect pgdat's memory region information
>>>> like: node_start_pfn, node_present_pages, etc. While in function
>>>> sparse_add/remove_one_section(), pgdat_resize_lock is used to protect
>>>> initialization/release of one mem_section. This looks not proper.
>>>>
>>>> Based on current implementation, even remove this lock, mem_section
>>>> is still away from contention, because it is protected by global
>>>> mem_hotpulg_lock.
>>>
>>> s/mem_hotpulg_lock/mem_hotplug_lock/
>>>
>>>>
>>>> Following is the current call trace of sparse_add/remove_one_section()
>>>>
>>>>     mem_hotplug_begin()
>>>>     arch_add_memory()
>>>>        add_pages()
>>>>            __add_pages()
>>>>                __add_section()
>>>>                    sparse_add_one_section()
>>>>     mem_hotplug_done()
>>>>
>>>>     mem_hotplug_begin()
>>>>     arch_remove_memory()
>>>>         __remove_pages()
>>>>             __remove_section()
>>>>                 sparse_remove_one_section()
>>>>     mem_hotplug_done()
>>>>
>>>> The comment above the pgdat_resize_lock also mentions "Holding this will
>>>> also guarantee that any pfn_valid() stays that way.", which is true with
>>>> the current implementation and false after this patch. But current
>>>> implementation doesn't meet this comment. There isn't any pfn walkers
>>>> to take the lock so this looks like a relict from the past. This patch
>>>> also removes this comment.
>>>
>>> Should we start to document which lock is expected to protect what?
>>>
>>> I suggest adding what you just found out to
>>> Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
>>> Maybe a new subsection for mem_hotplug_lock. And eventually also
>>> pgdat_resize_lock.
>> 
>> Well, I am not good at document writting. Below is my first trial.  Look
>> forward your comments.
>> 
>> BTW, in case I would send a new version with this, would I put this into
>> a separate one or merge this into current one?
>> 
>> diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
>> index 5c4432c96c4b..1548820a0762 100644
>> --- a/Documentation/admin-guide/mm/memory-hotplug.rst
>> +++ b/Documentation/admin-guide/mm/memory-hotplug.rst
>
>BTW, it really should go into
>
>Documentation/core-api/memory-hotplug.rst
>
>Something got wrong while merging this in linux-next, so now we have
>duplicate documentation and the one in
>Documentation/admin-guide/mm/memory-hotplug.rst about locking internals
>has to go.
>

Sounds reasonable.

Admins may not necessarily need to understand the internal locking.
Wei Yang Dec. 4, 2018, 8:53 a.m. UTC | #10
On Fri, Nov 30, 2018 at 10:52:30AM +0100, Michal Hocko wrote:
>On Fri 30-11-18 10:19:13, David Hildenbrand wrote:
>> >> I suggest adding what you just found out to
>> >> Documentation/admin-guide/mm/memory-hotplug.rst "Locking Internals".
>> >> Maybe a new subsection for mem_hotplug_lock. And eventually also
>> >> pgdat_resize_lock.
>> > 
>> > Well, I am not good at document writting. Below is my first trial.  Look
>> > forward your comments.
>> 
>> I'll have a look, maybe also Oscar and Michal can have a look. I guess
>> we don't have to cover all now, we can add more details as we discover them.
>
>Oscar, didn't you have something already?

Since we prefer to address the document in a separate patch, I will send
out v4 with changes suggested from Michal and David first.

>-- 
>Michal Hocko
>SUSE Labs
diff mbox series

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1bb749bee284..0a66085d7ced 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -638,8 +638,6 @@  typedef struct pglist_data {
 	/*
 	 * Must be held any time you expect node_start_pfn,
 	 * node_present_pages, node_spanned_pages or nr_zones stay constant.
-	 * Holding this will also guarantee that any pfn_valid() stays that
-	 * way.
 	 *
 	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
 	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
diff --git a/mm/sparse.c b/mm/sparse.c
index 33307fc05c4d..5825f276485f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -669,7 +669,6 @@  int __meminit sparse_add_one_section(struct pglist_data *pgdat,
 	struct mem_section *ms;
 	struct page *memmap;
 	unsigned long *usemap;
-	unsigned long flags;
 	int ret;
 
 	/*
@@ -689,8 +688,6 @@  int __meminit sparse_add_one_section(struct pglist_data *pgdat,
 		return -ENOMEM;
 	}
 
-	pgdat_resize_lock(pgdat, &flags);
-
 	ms = __pfn_to_section(start_pfn);
 	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
 		ret = -EEXIST;
@@ -707,7 +704,6 @@  int __meminit sparse_add_one_section(struct pglist_data *pgdat,
 	sparse_init_one_section(ms, section_nr, memmap, usemap);
 
 out:
-	pgdat_resize_unlock(pgdat, &flags);
 	if (ret < 0) {
 		kfree(usemap);
 		__kfree_section_memmap(memmap, altmap);
@@ -769,10 +765,8 @@  void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
 		unsigned long map_offset, struct vmem_altmap *altmap)
 {
 	struct page *memmap = NULL;
-	unsigned long *usemap = NULL, flags;
-	struct pglist_data *pgdat = zone->zone_pgdat;
+	unsigned long *usemap = NULL;
 
-	pgdat_resize_lock(pgdat, &flags);
 	if (ms->section_mem_map) {
 		usemap = ms->pageblock_flags;
 		memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -780,7 +774,6 @@  void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
 		ms->section_mem_map = 0;
 		ms->pageblock_flags = NULL;
 	}
-	pgdat_resize_unlock(pgdat, &flags);
 
 	clear_hwpoisoned_pages(memmap + map_offset,
 			PAGES_PER_SECTION - map_offset);