
[v6,05/15] mm: don't access uninitialized struct pages

Message ID 1502138329-123460-6-git-send-email-pasha.tatashin@oracle.com (mailing list archive)
State New, archived

Commit Message

Pavel Tatashin Aug. 7, 2017, 8:38 p.m. UTC
In deferred_init_memmap(), where all deferred struct pages are initialized,
we have a check like this:

    if (page->flags) {
            VM_BUG_ON(page_zone(page) != zone);
            goto free_range;
    }

This way we check whether the current deferred page has already been
initialized. It works because the memory for struct pages has been zeroed,
so the only way flags can be non-zero is if the page already went through
__init_single_page(). But once we change the current behavior and no longer
zero the memory in the memblock allocator, we cannot trust anything inside
a struct page until it is initialized. This patch fixes that.

This patch defines a new accessor memblock_get_reserved_pfn_range()
which returns successive ranges of reserved PFNs.  deferred_init_memmap()
calls it to determine if a PFN and its struct page have already been
initialized.
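
For illustration only, here is the intended calling pattern, condensed from
the page_alloc.c hunk below (a sketch, not the patch itself; the real loop
uses goto free_range instead of continue):

    unsigned long resv_start_pfn = 0, resv_end_pfn = 0;

    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
            /* Refresh the cached reserved range once the walk passes it. */
            if (pfn >= resv_end_pfn)
                    memblock_get_reserved_pfn_range(pfn, &resv_start_pfn,
                                                    &resv_end_pfn);
            /*
             * After the refresh, pfn >= resv_start_pfn means this pfn lies
             * in a reserved range whose struct page was already initialized
             * via reserve_bootmem_region(), so skip it.
             */
            if (pfn >= resv_start_pfn)
                    continue;
            /* ... initialize the deferred struct page for this pfn ... */
    }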

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
---
 include/linux/memblock.h |  3 +++
 mm/memblock.c            | 54 ++++++++++++++++++++++++++++++++++++++++++------
 mm/page_alloc.c          | 11 +++++++++-
 3 files changed, 61 insertions(+), 7 deletions(-)

Comments

Michal Hocko Aug. 11, 2017, 9:37 a.m. UTC | #1
On Mon 07-08-17 16:38:39, Pavel Tatashin wrote:
> In deferred_init_memmap(), where all deferred struct pages are initialized,
> we have a check like this:
> 
>     if (page->flags) {
>             VM_BUG_ON(page_zone(page) != zone);
>             goto free_range;
>     }
> 
> This way we check whether the current deferred page has already been
> initialized. It works because the memory for struct pages has been zeroed,
> so the only way flags can be non-zero is if the page already went through
> __init_single_page(). But once we change the current behavior and no longer
> zero the memory in the memblock allocator, we cannot trust anything inside
> a struct page until it is initialized. This patch fixes that.
> 
> This patch defines a new accessor memblock_get_reserved_pfn_range()
> which returns successive ranges of reserved PFNs.  deferred_init_memmap()
> calls it to determine if a PFN and its struct page have already been
> initialized.

Why don't we simply check the pfn against pgdat->first_deferred_pfn?

> Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
> Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> Reviewed-by: Bob Picco <bob.picco@oracle.com>
> ---
>  include/linux/memblock.h |  3 +++
>  mm/memblock.c            | 54 ++++++++++++++++++++++++++++++++++++++++++------
>  mm/page_alloc.c          | 11 +++++++++-
>  3 files changed, 61 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index bae11c7e7bf3..b6a2a610f5e1 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -320,6 +320,9 @@ int memblock_is_map_memory(phys_addr_t addr);
>  int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
>  bool memblock_is_reserved(phys_addr_t addr);
>  bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
> +void memblock_get_reserved_pfn_range(unsigned long pfn,
> +				     unsigned long *pfn_start,
> +				     unsigned long *pfn_end);
>  
>  extern void __memblock_dump_all(void);
>  
> diff --git a/mm/memblock.c b/mm/memblock.c
> index bf14aea6ab70..08f449acfdd1 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -1580,7 +1580,13 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
>  	memblock_cap_memory_range(0, max_addr);
>  }
>  
> -static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
> +/**
> + * Return the index in the regions array if addr is within a region;
> + * otherwise return -1. If -1 is returned and next_idx is not %NULL,
> + * set *next_idx to the index of the next region, or -1 if there is none.
> + */
> +static int __init_memblock memblock_search(struct memblock_type *type,
> +					   phys_addr_t addr, int *next_idx)
>  {
>  	unsigned int left = 0, right = type->cnt;
>  
> @@ -1595,22 +1601,26 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
>  		else
>  			return mid;
>  	} while (left < right);
> +
> +	if (next_idx)
> +		*next_idx = (right == type->cnt) ? -1 : right;
> +
>  	return -1;
>  }
>  
>  bool __init memblock_is_reserved(phys_addr_t addr)
>  {
> -	return memblock_search(&memblock.reserved, addr) != -1;
> +	return memblock_search(&memblock.reserved, addr, NULL) != -1;
>  }
>  
>  bool __init_memblock memblock_is_memory(phys_addr_t addr)
>  {
> -	return memblock_search(&memblock.memory, addr) != -1;
> +	return memblock_search(&memblock.memory, addr, NULL) != -1;
>  }
>  
>  int __init_memblock memblock_is_map_memory(phys_addr_t addr)
>  {
> -	int i = memblock_search(&memblock.memory, addr);
> +	int i = memblock_search(&memblock.memory, addr, NULL);
>  
>  	if (i == -1)
>  		return false;
> @@ -1622,7 +1632,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
>  			 unsigned long *start_pfn, unsigned long *end_pfn)
>  {
>  	struct memblock_type *type = &memblock.memory;
> -	int mid = memblock_search(type, PFN_PHYS(pfn));
> +	int mid = memblock_search(type, PFN_PHYS(pfn), NULL);
>  
>  	if (mid == -1)
>  		return -1;
> @@ -1646,7 +1656,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
>   */
>  int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
>  {
> -	int idx = memblock_search(&memblock.memory, base);
> +	int idx = memblock_search(&memblock.memory, base, NULL);
>  	phys_addr_t end = base + memblock_cap_size(base, &size);
>  
>  	if (idx == -1)
> @@ -1655,6 +1665,38 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
>  		 memblock.memory.regions[idx].size) >= end;
>  }
>  
> +/**
> + * memblock_get_reserved_pfn_range - search for the next reserved region
> + *
> + * @pfn: start searching from this pfn.
> + *
> + * RETURNS:
> + * [start_pfn, end_pfn), where start_pfn >= pfn. If none is found,
> + * start_pfn and end_pfn are both set to ULONG_MAX.
> + */
> +void __init_memblock memblock_get_reserved_pfn_range(unsigned long pfn,
> +						     unsigned long *start_pfn,
> +						     unsigned long *end_pfn)
> +{
> +	struct memblock_type *type = &memblock.reserved;
> +	int next_idx, idx;
> +
> +	idx = memblock_search(type, PFN_PHYS(pfn), &next_idx);
> +	if (idx == -1 && next_idx == -1) {
> +		*start_pfn = ULONG_MAX;
> +		*end_pfn = ULONG_MAX;
> +		return;
> +	}
> +
> +	if (idx == -1) {
> +		idx = next_idx;
> +		*start_pfn = PFN_DOWN(type->regions[idx].base);
> +	} else {
> +		*start_pfn = pfn;
> +	}
> +	*end_pfn = PFN_DOWN(type->regions[idx].base + type->regions[idx].size);
> +}
> +
>  /**
>   * memblock_is_region_reserved - check if a region intersects reserved memory
>   * @base: base of region to check
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 63d16c185736..983de0a8047b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1447,6 +1447,7 @@ static int __init deferred_init_memmap(void *data)
>  	pg_data_t *pgdat = data;
>  	int nid = pgdat->node_id;
>  	struct mminit_pfnnid_cache nid_init_state = { };
> +	unsigned long resv_start_pfn = 0, resv_end_pfn = 0;
>  	unsigned long start = jiffies;
>  	unsigned long nr_pages = 0;
>  	unsigned long walk_start, walk_end;
> @@ -1491,6 +1492,10 @@ static int __init deferred_init_memmap(void *data)
>  			pfn = zone->zone_start_pfn;
>  
>  		for (; pfn < end_pfn; pfn++) {
> +			if (pfn >= resv_end_pfn)
> +				memblock_get_reserved_pfn_range(pfn,
> +								&resv_start_pfn,
> +								&resv_end_pfn);
>  			if (!pfn_valid_within(pfn))
>  				goto free_range;
>  
> @@ -1524,7 +1529,11 @@ static int __init deferred_init_memmap(void *data)
>  				cond_resched();
>  			}
>  
> -			if (page->flags) {
> +			/*
> +			 * Check if this page has already been initialized due
> +			 * to being reserved during boot in memblock.
> +			 */
> +			if (pfn >= resv_start_pfn) {
>  				VM_BUG_ON(page_zone(page) != zone);
>  				goto free_range;
>  			}
> -- 
> 2.14.0
Pavel Tatashin Aug. 11, 2017, 3:55 p.m. UTC | #2
On 08/11/2017 05:37 AM, Michal Hocko wrote:
> On Mon 07-08-17 16:38:39, Pavel Tatashin wrote:
>> In deferred_init_memmap(), where all deferred struct pages are initialized,
>> we have a check like this:
>>
>>      if (page->flags) {
>>              VM_BUG_ON(page_zone(page) != zone);
>>              goto free_range;
>>      }
>>
>> This way we check whether the current deferred page has already been
>> initialized. It works because the memory for struct pages has been zeroed,
>> so the only way flags can be non-zero is if the page already went through
>> __init_single_page(). But once we change the current behavior and no longer
>> zero the memory in the memblock allocator, we cannot trust anything inside
>> a struct page until it is initialized. This patch fixes that.
>>
>> This patch defines a new accessor memblock_get_reserved_pfn_range()
>> which returns successive ranges of reserved PFNs.  deferred_init_memmap()
>> calls it to determine if a PFN and its struct page have already been
>> initialized.
> 
> Why don't we simply check the pfn against pgdat->first_deferred_pfn?

Because we are initializing deferred pages, and all of them have a pfn 
greater than pgdat->first_deferred_pfn. However, some of the deferred 
pages were already initialized if they were reserved, in this path:

mem_init()
  free_all_bootmem()
   free_low_memory_core_early()
    for_each_reserved_mem_region()
     reserve_bootmem_region()
      init_reserved_page() <- if this is deferred reserved page
       __init_single_pfn()
        __init_single_page()

So, currently, we are using the value of page->flags to figure out 
whether this deferred page has already been initialized, but this is not 
going to work for this project, as we do not zero the memory that is 
backing the struct pages, so the value of page->flags can be anything.
Michal Hocko Aug. 14, 2017, 11:47 a.m. UTC | #3
On Fri 11-08-17 11:55:39, Pasha Tatashin wrote:
> On 08/11/2017 05:37 AM, Michal Hocko wrote:
> >On Mon 07-08-17 16:38:39, Pavel Tatashin wrote:
> >>In deferred_init_memmap(), where all deferred struct pages are initialized,
> >>we have a check like this:
> >>
> >>     if (page->flags) {
> >>             VM_BUG_ON(page_zone(page) != zone);
> >>             goto free_range;
> >>     }
> >>
> >>This way we check whether the current deferred page has already been
> >>initialized. It works because the memory for struct pages has been zeroed,
> >>so the only way flags can be non-zero is if the page already went through
> >>__init_single_page(). But once we change the current behavior and no longer
> >>zero the memory in the memblock allocator, we cannot trust anything inside
> >>a struct page until it is initialized. This patch fixes that.
> >>
> >>This patch defines a new accessor memblock_get_reserved_pfn_range()
> >>which returns successive ranges of reserved PFNs.  deferred_init_memmap()
> >>calls it to determine if a PFN and its struct page have already been
> >>initialized.
> >
> >Why don't we simply check the pfn against pgdat->first_deferred_pfn?
> 
> Because we are initializing deferred pages, and all of them have a pfn greater
> than pgdat->first_deferred_pfn. However, some of the deferred pages were
> already initialized if they were reserved, in this path:
> 
> mem_init()
>  free_all_bootmem()
>   free_low_memory_core_early()
>    for_each_reserved_mem_region()
>     reserve_bootmem_region()
>      init_reserved_page() <- if this is deferred reserved page
>       __init_single_pfn()
>        __init_single_page()
> 
> So, currently, we are using the value of page->flags to figure out whether
> this deferred page has already been initialized, but this is not going to work
> for this project, as we do not zero the memory that is backing the struct
> pages, so the value of page->flags can be anything.

True, this is the initialization part I've missed in one of the previous
patches already. Would it be possible to only iterate over !reserved
memory blocks instead? Now that we discard all the metadata later, it
should be quite easy to do with for_each_memblock_type, no?
Pavel Tatashin Aug. 14, 2017, 1:51 p.m. UTC | #4
>> mem_init()
>>   free_all_bootmem()
>>    free_low_memory_core_early()
>>     for_each_reserved_mem_region()
>>      reserve_bootmem_region()
>>       init_reserved_page() <- if this is deferred reserved page
>>        __init_single_pfn()
>>         __init_single_page()
>>
>> So, currently, we are using the value of page->flags to figure out whether
>> this deferred page has already been initialized, but this is not going to
>> work for this project, as we do not zero the memory that is backing the
>> struct pages, so the value of page->flags can be anything.
> 
> True, this is the initialization part I've missed in one of the previous
> patches already. Would it be possible to only iterate over !reserved
> memory blocks instead? Now that we discard all the metadata later, it
> should be quite easy to do with for_each_memblock_type, no?

Hi Michal,

Clever suggestion to add a new iterator that goes through unreserved 
existing memory. I do not think such an iterator is available yet, so it 
would need to be implemented, using an approach similar to what I have 
done with the callback.

However, there is a different reason why I took the current approach.

Daniel Jordan is working on a ktask support:
https://lkml.org/lkml/2017/7/14/666

He and I discussed how to multi-thread struct page initialization 
within memory nodes using ktasks. Having this callback interface makes 
that multi-threading quite easy, improving boot performance further: 
with his prototype we saw 4-6x improvements (using 4-8 threads per 
node), reducing the total time it takes to initialize all struct pages 
on machines with terabytes of memory to less than one second.

Pasha
Pavel Tatashin Aug. 17, 2017, 3:28 p.m. UTC | #5
Hi Michal,

I've been looking through this code again, and I think your suggestion 
will work. I did not realize this iterator already exists:

for_each_free_mem_range() basically iterates through (memory && !reserved)

This is exactly what we need here. So, I will update this patch to use 
this iterator, which will simplify it.
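
Roughly like this (just a sketch of what the updated loop could look 
like, not the final code; the exact shape will be in the next revision):

    u64 i;
    phys_addr_t start, end;

    /*
     * Walk only (memory && !reserved) ranges of this node; reserved
     * pages were already initialized via reserve_bootmem_region().
     */
    for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
            unsigned long pfn = PFN_UP(start);
            unsigned long epfn = PFN_DOWN(end);

            for (; pfn < epfn; pfn++) {
                    /* ... __init_single_page() for this pfn ... */
            }
    }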

Pasha

On 08/14/2017 09:51 AM, Pasha Tatashin wrote:
>>> mem_init()
>>>   free_all_bootmem()
>>>    free_low_memory_core_early()
>>>     for_each_reserved_mem_region()
>>>      reserve_bootmem_region()
>>>       init_reserved_page() <- if this is deferred reserved page
>>>        __init_single_pfn()
>>>         __init_single_page()
>>>
>>> So, currently, we are using the value of page->flags to figure out 
>>> whether this deferred page has already been initialized, but this is 
>>> not going to work for this project, as we do not zero the memory that 
>>> is backing the struct pages, so the value of page->flags can be 
>>> anything.
>>
>> True, this is the initialization part I've missed in one of the previous
>> patches already. Would it be possible to only iterate over !reserved
>> memory blocks instead? Now that we discard all the metadata later, it
>> should be quite easy to do with for_each_memblock_type, no?
> 
> Hi Michal,
> 
> Clever suggestion to add a new iterator that goes through unreserved 
> existing memory. I do not think such an iterator is available yet, so it 
> would need to be implemented, using an approach similar to what I have 
> done with the callback.
> 
> However, there is a different reason why I took the current approach.
> 
> Daniel Jordan is working on a ktask support:
> https://lkml.org/lkml/2017/7/14/666
> 
> He and I discussed how to multi-thread struct page initialization 
> within memory nodes using ktasks. Having this callback interface makes 
> that multi-threading quite easy, improving boot performance further: 
> with his prototype we saw 4-6x improvements (using 4-8 threads per 
> node), reducing the total time it takes to initialize all struct pages 
> on machines with terabytes of memory to less than one second.
> 
> Pasha
Michal Hocko Aug. 17, 2017, 3:43 p.m. UTC | #6
On Thu 17-08-17 11:28:23, Pasha Tatashin wrote:
> Hi Michal,
> 
> I've been looking through this code again, and I think your suggestion will
> work. I did not realize this iterator already exists:
> 
> for_each_free_mem_range() basically iterates through (memory && !reserved)
> 
> This is exactly what we need here. So, I will update this patch to use this
> iterator, which will simplify it.

Please have a look at
http://lkml.kernel.org/r/20170815093306.GC29067@dhcp22.suse.cz
I believe we can simply drop the check altogether.

Patch

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index bae11c7e7bf3..b6a2a610f5e1 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -320,6 +320,9 @@ int memblock_is_map_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
 bool memblock_is_reserved(phys_addr_t addr);
 bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+void memblock_get_reserved_pfn_range(unsigned long pfn,
+				     unsigned long *pfn_start,
+				     unsigned long *pfn_end);
 
 extern void __memblock_dump_all(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index bf14aea6ab70..08f449acfdd1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1580,7 +1580,13 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
 	memblock_cap_memory_range(0, max_addr);
 }
 
-static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+/**
+ * Return the index in the regions array if addr is within a region;
+ * otherwise return -1. If -1 is returned and next_idx is not %NULL,
+ * set *next_idx to the index of the next region, or -1 if there is none.
+ */
+static int __init_memblock memblock_search(struct memblock_type *type,
+					   phys_addr_t addr, int *next_idx)
 {
 	unsigned int left = 0, right = type->cnt;
 
@@ -1595,22 +1601,26 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
 		else
 			return mid;
 	} while (left < right);
+
+	if (next_idx)
+		*next_idx = (right == type->cnt) ? -1 : right;
+
 	return -1;
 }
 
 bool __init memblock_is_reserved(phys_addr_t addr)
 {
-	return memblock_search(&memblock.reserved, addr) != -1;
+	return memblock_search(&memblock.reserved, addr, NULL) != -1;
 }
 
 bool __init_memblock memblock_is_memory(phys_addr_t addr)
 {
-	return memblock_search(&memblock.memory, addr) != -1;
+	return memblock_search(&memblock.memory, addr, NULL) != -1;
 }
 
 int __init_memblock memblock_is_map_memory(phys_addr_t addr)
 {
-	int i = memblock_search(&memblock.memory, addr);
+	int i = memblock_search(&memblock.memory, addr, NULL);
 
 	if (i == -1)
 		return false;
@@ -1622,7 +1632,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
 			 unsigned long *start_pfn, unsigned long *end_pfn)
 {
 	struct memblock_type *type = &memblock.memory;
-	int mid = memblock_search(type, PFN_PHYS(pfn));
+	int mid = memblock_search(type, PFN_PHYS(pfn), NULL);
 
 	if (mid == -1)
 		return -1;
@@ -1646,7 +1656,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
  */
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
 {
-	int idx = memblock_search(&memblock.memory, base);
+	int idx = memblock_search(&memblock.memory, base, NULL);
 	phys_addr_t end = base + memblock_cap_size(base, &size);
 
 	if (idx == -1)
@@ -1655,6 +1665,38 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
 		 memblock.memory.regions[idx].size) >= end;
 }
 
+/**
+ * memblock_get_reserved_pfn_range - search for the next reserved region
+ *
+ * @pfn: start searching from this pfn.
+ *
+ * RETURNS:
+ * [start_pfn, end_pfn), where start_pfn >= pfn. If none is found,
+ * start_pfn and end_pfn are both set to ULONG_MAX.
+ */
+void __init_memblock memblock_get_reserved_pfn_range(unsigned long pfn,
+						     unsigned long *start_pfn,
+						     unsigned long *end_pfn)
+{
+	struct memblock_type *type = &memblock.reserved;
+	int next_idx, idx;
+
+	idx = memblock_search(type, PFN_PHYS(pfn), &next_idx);
+	if (idx == -1 && next_idx == -1) {
+		*start_pfn = ULONG_MAX;
+		*end_pfn = ULONG_MAX;
+		return;
+	}
+
+	if (idx == -1) {
+		idx = next_idx;
+		*start_pfn = PFN_DOWN(type->regions[idx].base);
+	} else {
+		*start_pfn = pfn;
+	}
+	*end_pfn = PFN_DOWN(type->regions[idx].base + type->regions[idx].size);
+}
+
 /**
  * memblock_is_region_reserved - check if a region intersects reserved memory
  * @base: base of region to check
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63d16c185736..983de0a8047b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1447,6 +1447,7 @@ static int __init deferred_init_memmap(void *data)
 	pg_data_t *pgdat = data;
 	int nid = pgdat->node_id;
 	struct mminit_pfnnid_cache nid_init_state = { };
+	unsigned long resv_start_pfn = 0, resv_end_pfn = 0;
 	unsigned long start = jiffies;
 	unsigned long nr_pages = 0;
 	unsigned long walk_start, walk_end;
@@ -1491,6 +1492,10 @@ static int __init deferred_init_memmap(void *data)
 			pfn = zone->zone_start_pfn;
 
 		for (; pfn < end_pfn; pfn++) {
+			if (pfn >= resv_end_pfn)
+				memblock_get_reserved_pfn_range(pfn,
+								&resv_start_pfn,
+								&resv_end_pfn);
 			if (!pfn_valid_within(pfn))
 				goto free_range;
 
@@ -1524,7 +1529,11 @@ static int __init deferred_init_memmap(void *data)
 				cond_resched();
 			}
 
-			if (page->flags) {
+			/*
+			 * Check if this page has already been initialized due
+			 * to being reserved during boot in memblock.
+			 */
+			if (pfn >= resv_start_pfn) {
 				VM_BUG_ON(page_zone(page) != zone);
 				goto free_range;
 			}