Message ID | 50BC0D2D.8040008@huawei.com (mailing list archive) |
---|---|
State | Not Applicable, archived |
Hi Wu, Sorry to make noise here. Please see below. :) On 12/03/2012 10:23 AM, Jianguo Wu wrote: > Signed-off-by: Jianguo Wu<wujianguo@huawei.com> > Signed-off-by: Jiang Liu<jiang.liu@huawei.com> > --- > include/linux/mm.h | 1 + > mm/sparse-vmemmap.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++ > mm/sparse.c | 3 +- > 3 files changed, 234 insertions(+), 1 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 5657670..1f26af5 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); > void vmemmap_populate_print_last(void); > void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, > unsigned long size); > +void vmemmap_free(struct page *memmap, unsigned long nr_pages); > > enum mf_flags { > MF_COUNT_INCREASED = 1<< 0, > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c > index 1b7e22a..748732d 100644 > --- a/mm/sparse-vmemmap.c > +++ b/mm/sparse-vmemmap.c > @@ -29,6 +29,10 @@ > #include<asm/pgalloc.h> > #include<asm/pgtable.h> > > +#ifdef CONFIG_MEMORY_HOTREMOVE > +#include<asm/tlbflush.h> > +#endif > + > /* > * Allocate a block of memory to be used to back the virtual memory map > * or to back the page tables that are used to create the mapping. > @@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, > vmemmap_buf_end = NULL; > } > } > + > +#ifdef CONFIG_MEMORY_HOTREMOVE > + > +#define PAGE_INUSE 0xFD > + > +static void vmemmap_free_pages(struct page *page, int order) > +{ > + struct zone *zone; > + unsigned long magic; > + > + magic = (unsigned long) page->lru.next; > + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { > + put_page_bootmem(page); > + > + zone = page_zone(page); > + zone_span_writelock(zone); > + zone->present_pages++; > + zone_span_writeunlock(zone); > + totalram_pages++; Seems that we have different ways to handle pages allocated by bootmem or by regular allocator. Is the checking way in [PATCH 09/12] available here ? + /* bootmem page has reserved flag */ + if (PageReserved(page)) { ...... + } If so, I think we can just merge these two functions. 
> + } else > + free_pages((unsigned long)page_address(page), order); > +} > + > +static void free_pte_table(pmd_t *pmd) > +{ > + pte_t *pte, *pte_start; > + int i; > + > + pte_start = (pte_t *)pmd_page_vaddr(*pmd); > + for (i = 0; i< PTRS_PER_PTE; i++) { > + pte = pte_start + i; > + if (pte_val(*pte)) > + return; > + } > + > + /* free a pte talbe */ > + vmemmap_free_pages(pmd_page(*pmd), 0); > + spin_lock(&init_mm.page_table_lock); > + pmd_clear(pmd); > + spin_unlock(&init_mm.page_table_lock); > +} > + > +static void free_pmd_table(pud_t *pud) > +{ > + pmd_t *pmd, *pmd_start; > + int i; > + > + pmd_start = (pmd_t *)pud_page_vaddr(*pud); > + for (i = 0; i< PTRS_PER_PMD; i++) { > + pmd = pmd_start + i; > + if (pmd_val(*pmd)) > + return; > + } > + > + /* free a pmd talbe */ > + vmemmap_free_pages(pud_page(*pud), 0); > + spin_lock(&init_mm.page_table_lock); > + pud_clear(pud); > + spin_unlock(&init_mm.page_table_lock); > +} > + > +static void free_pud_table(pgd_t *pgd) > +{ > + pud_t *pud, *pud_start; > + int i; > + > + pud_start = (pud_t *)pgd_page_vaddr(*pgd); > + for (i = 0; i< PTRS_PER_PUD; i++) { > + pud = pud_start + i; > + if (pud_val(*pud)) > + return; > + } > + > + /* free a pud table */ > + vmemmap_free_pages(pgd_page(*pgd), 0); > + spin_lock(&init_mm.page_table_lock); > + pgd_clear(pgd); > + spin_unlock(&init_mm.page_table_lock); > +} All the free_xxx_table() are very similar to the functions in [PATCH 09/12]. Could we reuse them anyway ? > + > +static int split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase) > +{ > + struct page *page = pmd_page(*(pmd_t *)kpte); > + int i = 0; > + unsigned long magic; > + unsigned long section_nr; > + > + __split_large_page(kpte, address, pbase); Is this patch going to replace [PATCH 08/12] ? If so, __split_large_page() was added and exported in [PATCH 09/12], then we should move it here, right ? If not, free_map_bootmem() and __kfree_section_memmap() were changed in [PATCH 08/12], and we need to handle this. > + __flush_tlb_all(); > + > + magic = (unsigned long) page->lru.next; > + if (magic == SECTION_INFO) { > + section_nr = pfn_to_section_nr(page_to_pfn(page)); > + while (i< PTRS_PER_PMD) { > + page++; > + i++; > + get_page_bootmem(section_nr, page, SECTION_INFO); > + } > + } > + > + return 0; > +} > + > +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long end) > +{ > + pte_t *pte; > + unsigned long next; > + void *page_addr; > + > + pte = pte_offset_kernel(pmd, addr); > + for (; addr< end; pte++, addr += PAGE_SIZE) { > + next = (addr + PAGE_SIZE)& PAGE_MASK; > + if (next> end) > + next = end; > + > + if (pte_none(*pte)) > + continue; > + if (IS_ALIGNED(addr, PAGE_SIZE)&& > + IS_ALIGNED(next, PAGE_SIZE)) { > + vmemmap_free_pages(pte_page(*pte), 0); > + spin_lock(&init_mm.page_table_lock); > + pte_clear(&init_mm, addr, pte); > + spin_unlock(&init_mm.page_table_lock); > + } else { > + /* > + * Removed page structs are filled with 0xFD. 
> + */ > + memset((void *)addr, PAGE_INUSE, next - addr); > + page_addr = page_address(pte_page(*pte)); > + > + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { > + spin_lock(&init_mm.page_table_lock); > + pte_clear(&init_mm, addr, pte); > + spin_unlock(&init_mm.page_table_lock); > + } > + } > + } > + > + free_pte_table(pmd); > + __flush_tlb_all(); > +} > + > +static void vmemmap_pmd_remove(pud_t *pud, unsigned long addr, unsigned long end) > +{ > + unsigned long next; > + pmd_t *pmd; > + > + pmd = pmd_offset(pud, addr); > + for (; addr< end; addr = next, pmd++) { > + next = pmd_addr_end(addr, end); > + if (pmd_none(*pmd)) > + continue; > + > + if (cpu_has_pse) { > + unsigned long pte_base; > + > + if (IS_ALIGNED(addr, PMD_SIZE)&& > + IS_ALIGNED(next, PMD_SIZE)) { > + vmemmap_free_pages(pmd_page(*pmd), > + get_order(PMD_SIZE)); > + spin_lock(&init_mm.page_table_lock); > + pmd_clear(pmd); > + spin_unlock(&init_mm.page_table_lock); > + continue; > + } > + > + /* > + * We use 2M page, but we need to remove part of them, > + * so split 2M page to 4K page. > + */ > + pte_base = get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); > + if (!pte_base) { > + WARN_ON(1); > + continue; > + } > + > + split_large_page((pte_t *)pmd, addr, (pte_t *)pte_base); > + __flush_tlb_all(); > + > + spin_lock(&init_mm.page_table_lock); > + pmd_populate_kernel(&init_mm, pmd, (pte_t *)pte_base); > + spin_unlock(&init_mm.page_table_lock); > + } > + > + vmemmap_pte_remove(pmd, addr, next); > + } > + > + free_pmd_table(pud); > + __flush_tlb_all(); > +} > + > +static void vmemmap_pud_remove(pgd_t *pgd, unsigned long addr, unsigned long end) > +{ > + unsigned long next; > + pud_t *pud; > + > + pud = pud_offset(pgd, addr); > + for (; addr< end; addr = next, pud++) { > + next = pud_addr_end(addr, end); > + if (pud_none(*pud)) > + continue; > + > + vmemmap_pmd_remove(pud, addr, next); > + } > + > + free_pud_table(pgd); > + __flush_tlb_all(); > +} > + > +void vmemmap_free(struct page *memmap, unsigned long nr_pages) > +{ > + unsigned long addr = (unsigned long)memmap; > + unsigned long end = (unsigned long)(memmap + nr_pages); > + unsigned long next; > + > + for (; addr< end; addr = next) { > + pgd_t *pgd = pgd_offset_k(addr); > + > + next = pgd_addr_end(addr, end); > + if (!pgd_present(*pgd)) > + continue; > + > + vmemmap_pud_remove(pgd, addr, next); > + sync_global_pgds(addr, next - 1); > + } > +} > +#endif > diff --git a/mm/sparse.c b/mm/sparse.c > index fac95f2..4060229 100644 > --- a/mm/sparse.c > +++ b/mm/sparse.c > @@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, > } > static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) > { > - return; /* XXX: Not implemented yet */ > + vmemmap_free(memmap, nr_pages); > } > static void free_map_bootmem(struct page *page, unsigned long nr_pages) In the latest kernel, this line was: static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) > { > + vmemmap_free(page, nr_pages); > } > #else > static struct page *__kmalloc_section_memmap(unsigned long nr_pages) -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Tang, Thanks for your review and comments, Please see below for my reply. On 2012/12/4 17:13, Tang Chen wrote: > Hi Wu, > > Sorry to make noise here. Please see below. :) > > On 12/03/2012 10:23 AM, Jianguo Wu wrote: >> Signed-off-by: Jianguo Wu<wujianguo@huawei.com> >> Signed-off-by: Jiang Liu<jiang.liu@huawei.com> >> --- >> include/linux/mm.h | 1 + >> mm/sparse-vmemmap.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++ >> mm/sparse.c | 3 +- >> 3 files changed, 234 insertions(+), 1 deletions(-) >> >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index 5657670..1f26af5 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); >> void vmemmap_populate_print_last(void); >> void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, >> unsigned long size); >> +void vmemmap_free(struct page *memmap, unsigned long nr_pages); >> >> enum mf_flags { >> MF_COUNT_INCREASED = 1<< 0, >> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c >> index 1b7e22a..748732d 100644 >> --- a/mm/sparse-vmemmap.c >> +++ b/mm/sparse-vmemmap.c >> @@ -29,6 +29,10 @@ >> #include<asm/pgalloc.h> >> #include<asm/pgtable.h> >> >> +#ifdef CONFIG_MEMORY_HOTREMOVE >> +#include<asm/tlbflush.h> >> +#endif >> + >> /* >> * Allocate a block of memory to be used to back the virtual memory map >> * or to back the page tables that are used to create the mapping. >> @@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, >> vmemmap_buf_end = NULL; >> } >> } >> + >> +#ifdef CONFIG_MEMORY_HOTREMOVE >> + >> +#define PAGE_INUSE 0xFD >> + >> +static void vmemmap_free_pages(struct page *page, int order) >> +{ >> + struct zone *zone; >> + unsigned long magic; >> + >> + magic = (unsigned long) page->lru.next; >> + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { >> + put_page_bootmem(page); >> + >> + zone = page_zone(page); >> + zone_span_writelock(zone); >> + zone->present_pages++; >> + zone_span_writeunlock(zone); >> + totalram_pages++; > > Seems that we have different ways to handle pages allocated by bootmem > or by regular allocator. Is the checking way in [PATCH 09/12] available > here ? > > + /* bootmem page has reserved flag */ > + if (PageReserved(page)) { > ...... > + } > > If so, I think we can just merge these two functions. Hmm, direct mapping table isn't allocated by bootmem allocator such as memblock, can't be free by put_page_bootmem(). But I will try to merge these two functions. 
> >> + } else >> + free_pages((unsigned long)page_address(page), order); >> +} >> + >> +static void free_pte_table(pmd_t *pmd) >> +{ >> + pte_t *pte, *pte_start; >> + int i; >> + >> + pte_start = (pte_t *)pmd_page_vaddr(*pmd); >> + for (i = 0; i< PTRS_PER_PTE; i++) { >> + pte = pte_start + i; >> + if (pte_val(*pte)) >> + return; >> + } >> + >> + /* free a pte talbe */ >> + vmemmap_free_pages(pmd_page(*pmd), 0); >> + spin_lock(&init_mm.page_table_lock); >> + pmd_clear(pmd); >> + spin_unlock(&init_mm.page_table_lock); >> +} >> + >> +static void free_pmd_table(pud_t *pud) >> +{ >> + pmd_t *pmd, *pmd_start; >> + int i; >> + >> + pmd_start = (pmd_t *)pud_page_vaddr(*pud); >> + for (i = 0; i< PTRS_PER_PMD; i++) { >> + pmd = pmd_start + i; >> + if (pmd_val(*pmd)) >> + return; >> + } >> + >> + /* free a pmd talbe */ >> + vmemmap_free_pages(pud_page(*pud), 0); >> + spin_lock(&init_mm.page_table_lock); >> + pud_clear(pud); >> + spin_unlock(&init_mm.page_table_lock); >> +} >> + >> +static void free_pud_table(pgd_t *pgd) >> +{ >> + pud_t *pud, *pud_start; >> + int i; >> + >> + pud_start = (pud_t *)pgd_page_vaddr(*pgd); >> + for (i = 0; i< PTRS_PER_PUD; i++) { >> + pud = pud_start + i; >> + if (pud_val(*pud)) >> + return; >> + } >> + >> + /* free a pud table */ >> + vmemmap_free_pages(pgd_page(*pgd), 0); >> + spin_lock(&init_mm.page_table_lock); >> + pgd_clear(pgd); >> + spin_unlock(&init_mm.page_table_lock); >> +} > > All the free_xxx_table() are very similar to the functions in > [PATCH 09/12]. Could we reuse them anyway ? yes, we can reuse them. > >> + >> +static int split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase) >> +{ >> + struct page *page = pmd_page(*(pmd_t *)kpte); >> + int i = 0; >> + unsigned long magic; >> + unsigned long section_nr; >> + >> + __split_large_page(kpte, address, pbase); > > Is this patch going to replace [PATCH 08/12] ? > I wish to replace [PATCH 08/12], but need Congyang and Yasuaki to confirm first:) > If so, __split_large_page() was added and exported in [PATCH 09/12], > then we should move it here, right ? yes. and what do you think about moving vmemmap_pud[pmd/pte]_remove() to arch/x86/mm/init_64.c, to be consistent with vmemmap_populate() ? I will rework [PATCH 08/12] and [PATCH 09/12] soon. Thanks, Jianguo Wu. > > If not, free_map_bootmem() and __kfree_section_memmap() were changed in > [PATCH 08/12], and we need to handle this. > >> + __flush_tlb_all(); >> + >> + magic = (unsigned long) page->lru.next; >> + if (magic == SECTION_INFO) { >> + section_nr = pfn_to_section_nr(page_to_pfn(page)); >> + while (i< PTRS_PER_PMD) { >> + page++; >> + i++; >> + get_page_bootmem(section_nr, page, SECTION_INFO); >> + } >> + } >> + >> + return 0; >> +} >> + >> +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long end) >> +{ >> + pte_t *pte; >> + unsigned long next; >> + void *page_addr; >> + >> + pte = pte_offset_kernel(pmd, addr); >> + for (; addr< end; pte++, addr += PAGE_SIZE) { >> + next = (addr + PAGE_SIZE)& PAGE_MASK; >> + if (next> end) >> + next = end; >> + >> + if (pte_none(*pte)) >> + continue; >> + if (IS_ALIGNED(addr, PAGE_SIZE)&& >> + IS_ALIGNED(next, PAGE_SIZE)) { >> + vmemmap_free_pages(pte_page(*pte), 0); >> + spin_lock(&init_mm.page_table_lock); >> + pte_clear(&init_mm, addr, pte); >> + spin_unlock(&init_mm.page_table_lock); >> + } else { >> + /* >> + * Removed page structs are filled with 0xFD. 
>> + */ >> + memset((void *)addr, PAGE_INUSE, next - addr); >> + page_addr = page_address(pte_page(*pte)); >> + >> + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { >> + spin_lock(&init_mm.page_table_lock); >> + pte_clear(&init_mm, addr, pte); >> + spin_unlock(&init_mm.page_table_lock); >> + } >> + } >> + } >> + >> + free_pte_table(pmd); >> + __flush_tlb_all(); >> +} >> + >> +static void vmemmap_pmd_remove(pud_t *pud, unsigned long addr, unsigned long end) >> +{ >> + unsigned long next; >> + pmd_t *pmd; >> + >> + pmd = pmd_offset(pud, addr); >> + for (; addr< end; addr = next, pmd++) { >> + next = pmd_addr_end(addr, end); >> + if (pmd_none(*pmd)) >> + continue; >> + >> + if (cpu_has_pse) { >> + unsigned long pte_base; >> + >> + if (IS_ALIGNED(addr, PMD_SIZE)&& >> + IS_ALIGNED(next, PMD_SIZE)) { >> + vmemmap_free_pages(pmd_page(*pmd), >> + get_order(PMD_SIZE)); >> + spin_lock(&init_mm.page_table_lock); >> + pmd_clear(pmd); >> + spin_unlock(&init_mm.page_table_lock); >> + continue; >> + } >> + >> + /* >> + * We use 2M page, but we need to remove part of them, >> + * so split 2M page to 4K page. >> + */ >> + pte_base = get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); >> + if (!pte_base) { >> + WARN_ON(1); >> + continue; >> + } >> + >> + split_large_page((pte_t *)pmd, addr, (pte_t *)pte_base); >> + __flush_tlb_all(); >> + >> + spin_lock(&init_mm.page_table_lock); >> + pmd_populate_kernel(&init_mm, pmd, (pte_t *)pte_base); >> + spin_unlock(&init_mm.page_table_lock); >> + } >> + >> + vmemmap_pte_remove(pmd, addr, next); >> + } >> + >> + free_pmd_table(pud); >> + __flush_tlb_all(); >> +} >> + >> +static void vmemmap_pud_remove(pgd_t *pgd, unsigned long addr, unsigned long end) >> +{ >> + unsigned long next; >> + pud_t *pud; >> + >> + pud = pud_offset(pgd, addr); >> + for (; addr< end; addr = next, pud++) { >> + next = pud_addr_end(addr, end); >> + if (pud_none(*pud)) >> + continue; >> + >> + vmemmap_pmd_remove(pud, addr, next); >> + } >> + >> + free_pud_table(pgd); >> + __flush_tlb_all(); >> +} >> + >> +void vmemmap_free(struct page *memmap, unsigned long nr_pages) >> +{ >> + unsigned long addr = (unsigned long)memmap; >> + unsigned long end = (unsigned long)(memmap + nr_pages); >> + unsigned long next; >> + >> + for (; addr< end; addr = next) { >> + pgd_t *pgd = pgd_offset_k(addr); >> + >> + next = pgd_addr_end(addr, end); >> + if (!pgd_present(*pgd)) >> + continue; >> + >> + vmemmap_pud_remove(pgd, addr, next); >> + sync_global_pgds(addr, next - 1); >> + } >> +} >> +#endif >> diff --git a/mm/sparse.c b/mm/sparse.c >> index fac95f2..4060229 100644 >> --- a/mm/sparse.c >> +++ b/mm/sparse.c >> @@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, >> } >> static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) >> { >> - return; /* XXX: Not implemented yet */ >> + vmemmap_free(memmap, nr_pages); >> } >> static void free_map_bootmem(struct page *page, unsigned long nr_pages) > > In the latest kernel, this line was: > static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) > >> { >> + vmemmap_free(page, nr_pages); >> } >> #else >> static struct page *__kmalloc_section_memmap(unsigned long nr_pages) > > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Wu, On 12/04/2012 08:20 PM, Jianguo Wu wrote: (snip) >> >> Seems that we have different ways to handle pages allocated by bootmem >> or by regular allocator. Is the checking way in [PATCH 09/12] available >> here ? >> >> + /* bootmem page has reserved flag */ >> + if (PageReserved(page)) { >> ...... >> + } >> >> If so, I think we can just merge these two functions. > > Hmm, direct mapping table isn't allocated by bootmem allocator such as memblock, can't be free by put_page_bootmem(). > But I will try to merge these two functions. > Oh, I didn't notice this, thanks. :) (snip) >>> + >>> + __split_large_page(kpte, address, pbase); >> >> Is this patch going to replace [PATCH 08/12] ? >> > > I wish to replace [PATCH 08/12], but need Congyang and Yasuaki to confirm first:) > >> If so, __split_large_page() was added and exported in [PATCH 09/12], >> then we should move it here, right ? > > yes. > > and what do you think about moving vmemmap_pud[pmd/pte]_remove() to arch/x86/mm/init_64.c, > to be consistent with vmemmap_populate() ? It is a good idea since pud/pmd/pte related code could be platform dependent. And I'm also trying to move vmemmap_free() to arch/x86/mm/init_64.c too. I want to have a common interface just like vmemmap_populate(). :) > > I will rework [PATCH 08/12] and [PATCH 09/12] soon. I am rebasing the whole patch set now. And I think I could finish part of your work too. A new patch-set is coming soon, and your rework is also welcome. :) Thanks. :) -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
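For context, the layout being discussed above would look roughly like the sketch below: a generic declaration kept next to vmemmap_populate(), with the actual page-table walk owned by arch code. This is only an illustration of the proposed split, not code from either patch set, and remove_pagetable() is a hypothetical name for whatever merged walker the reworked [PATCH 08/12]/[PATCH 09/12] end up providing.

/* include/linux/mm.h: generic declaration, next to vmemmap_populate() */
void vmemmap_free(struct page *memmap, unsigned long nr_pages);

/* arch/x86/mm/init_64.c: the arch owns the page-table walk */
#ifdef CONFIG_MEMORY_HOTREMOVE
void vmemmap_free(struct page *memmap, unsigned long nr_pages)
{
    unsigned long start = (unsigned long)memmap;
    unsigned long end = (unsigned long)(memmap + nr_pages);

    /*
     * remove_pagetable() is a placeholder for the shared walker that
     * could also serve the direct-mapping removal path in [PATCH 09/12].
     */
    remove_pagetable(start, end);
    sync_global_pgds(start, end - 1);
}
#endif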
Hi Tang, On 2012/12/5 10:07, Tang Chen wrote: > Hi Wu, > > On 12/04/2012 08:20 PM, Jianguo Wu wrote: > (snip) >>> >>> Seems that we have different ways to handle pages allocated by bootmem >>> or by regular allocator. Is the checking way in [PATCH 09/12] available >>> here ? >>> >>> + /* bootmem page has reserved flag */ >>> + if (PageReserved(page)) { >>> ...... >>> + } >>> >>> If so, I think we can just merge these two functions. >> >> Hmm, direct mapping table isn't allocated by bootmem allocator such as memblock, can't be free by put_page_bootmem(). >> But I will try to merge these two functions. >> > > Oh, I didn't notice this, thanks. :) > > (snip) > >>>> + >>>> + __split_large_page(kpte, address, pbase); >>> >>> Is this patch going to replace [PATCH 08/12] ? >>> >> >> I wish to replace [PATCH 08/12], but need Congyang and Yasuaki to confirm first:) >> >>> If so, __split_large_page() was added and exported in [PATCH 09/12], >>> then we should move it here, right ? >> >> yes. >> >> and what do you think about moving vmemmap_pud[pmd/pte]_remove() to arch/x86/mm/init_64.c, >> to be consistent with vmemmap_populate() ? > > It is a good idea since pud/pmd/pte related code could be platform > dependent. And I'm also trying to move vmemmap_free() to > arch/x86/mm/init_64.c too. I want to have a common interface just > like vmemmap_populate(). :) > Great. >> >> I will rework [PATCH 08/12] and [PATCH 09/12] soon. > > I am rebasing the whole patch set now. And I think I chould finish part > of your work too. A new patch-set is coming soon, and your rework is > also welcome. :) > Since you are rebasing now, I will wait for your new patch-set :). Thanks. Jianguo Wu > Thanks. :) > > > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Wu, I met some problems when I was digging into the code. It's very kind of you if you could help me with that. :) If I misunderstood your code, please tell me. Please see below. :) On 12/03/2012 10:23 AM, Jianguo Wu wrote: > Signed-off-by: Jianguo Wu<wujianguo@huawei.com> > Signed-off-by: Jiang Liu<jiang.liu@huawei.com> > --- > include/linux/mm.h | 1 + > mm/sparse-vmemmap.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++ > mm/sparse.c | 3 +- > 3 files changed, 234 insertions(+), 1 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 5657670..1f26af5 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); > void vmemmap_populate_print_last(void); > void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, > unsigned long size); > +void vmemmap_free(struct page *memmap, unsigned long nr_pages); > > enum mf_flags { > MF_COUNT_INCREASED = 1<< 0, > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c > index 1b7e22a..748732d 100644 > --- a/mm/sparse-vmemmap.c > +++ b/mm/sparse-vmemmap.c > @@ -29,6 +29,10 @@ > #include<asm/pgalloc.h> > #include<asm/pgtable.h> > > +#ifdef CONFIG_MEMORY_HOTREMOVE > +#include<asm/tlbflush.h> > +#endif > + > /* > * Allocate a block of memory to be used to back the virtual memory map > * or to back the page tables that are used to create the mapping. > @@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, > vmemmap_buf_end = NULL; > } > } > + > +#ifdef CONFIG_MEMORY_HOTREMOVE > + > +#define PAGE_INUSE 0xFD > + > +static void vmemmap_free_pages(struct page *page, int order) > +{ > + struct zone *zone; > + unsigned long magic; > + > + magic = (unsigned long) page->lru.next; > + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { > + put_page_bootmem(page); > + > + zone = page_zone(page); > + zone_span_writelock(zone); > + zone->present_pages++; > + zone_span_writeunlock(zone); > + totalram_pages++; > + } else > + free_pages((unsigned long)page_address(page), order); Here, I think SECTION_INFO and MIX_SECTION_INFO pages are all allocated by bootmem, so I put this function this way. I'm not sure if parameter order is necessary here. It will always be 0 in your code. Is this OK to you ? static void free_pagetable(struct page *page) { struct zone *zone; bool bootmem = false; unsigned long magic; /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); bootmem = true; } magic = (unsigned long) page->lru.next; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) put_page_bootmem(page); else __free_page(page); /* * SECTION_INFO pages and MIX_SECTION_INFO pages * are all allocated by bootmem. */ if (bootmem) { zone = page_zone(page); zone_span_writelock(zone); zone->present_pages++; zone_span_writeunlock(zone); totalram_pages++; } } (snip) > + > +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long end) > +{ > + pte_t *pte; > + unsigned long next; > + void *page_addr; > + > + pte = pte_offset_kernel(pmd, addr); > + for (; addr< end; pte++, addr += PAGE_SIZE) { > + next = (addr + PAGE_SIZE)& PAGE_MASK; > + if (next> end) > + next = end; > + > + if (pte_none(*pte)) Here, you checked xxx_none() in your vmemmap_xxx_remove(), but you used !xxx_present() in your x86_64 patches. Is it OK if I only check !xxx_present() ? 
> + continue; > + if (IS_ALIGNED(addr, PAGE_SIZE)&& > + IS_ALIGNED(next, PAGE_SIZE)) { > + vmemmap_free_pages(pte_page(*pte), 0); > + spin_lock(&init_mm.page_table_lock); > + pte_clear(&init_mm, addr, pte); > + spin_unlock(&init_mm.page_table_lock); > + } else { > + /* > + * Removed page structs are filled with 0xFD. > + */ > + memset((void *)addr, PAGE_INUSE, next - addr); > + page_addr = page_address(pte_page(*pte)); > + > + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { > + spin_lock(&init_mm.page_table_lock); > + pte_clear(&init_mm, addr, pte); > + spin_unlock(&init_mm.page_table_lock); Here, since we clear pte, we should also free the page, right ? > + } > + } > + } > + > + free_pte_table(pmd); > + __flush_tlb_all(); > +} > + > +static void vmemmap_pmd_remove(pud_t *pud, unsigned long addr, unsigned long end) > +{ > + unsigned long next; > + pmd_t *pmd; > + > + pmd = pmd_offset(pud, addr); > + for (; addr< end; addr = next, pmd++) { > + next = (addr, end); And by the way, there isn't pte_addr_end() in kernel, why ? I saw you calculated it like this: next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end) next = end; This logic is very similar to {pmd|pud|pgd}_addr_end(). Shall we add a pte_addr_end() or something ? :) Since there is no such code in kernel for a long time, I think there must be some reasons. I merged free_xxx_table() and remove_xxx_table() as common interfaces. And again, thanks for your patient and nice explanation. :) (snip) -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
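To make the "also free the page" point above concrete: in the partially-filled branch of vmemmap_pte_remove(), the backing page has to be released as well, mirroring what the fully-aligned branch already does. A sketch of the corrected branch (illustrative only, not the resubmitted code):

} else {
    /*
     * Removed page structs are filled with PAGE_INUSE (0xFD); only tear
     * the mapping down once the whole backing page is marked that way.
     */
    memset((void *)addr, PAGE_INUSE, next - addr);
    page_addr = page_address(pte_page(*pte));

    if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
        /* the missing step: free the page backing this pte */
        vmemmap_free_pages(pte_page(*pte), 0);

        spin_lock(&init_mm.page_table_lock);
        pte_clear(&init_mm, addr, pte);
        spin_unlock(&init_mm.page_table_lock);
    }
}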
Hi Tang, On 2012/12/7 9:42, Tang Chen wrote: > Hi Wu, > > I met some problems when I was digging into the code. It's very > kind of you if you could help me with that. :) > > If I misunderstood your code, please tell me. > Please see below. :) > > On 12/03/2012 10:23 AM, Jianguo Wu wrote: >> Signed-off-by: Jianguo Wu<wujianguo@huawei.com> >> Signed-off-by: Jiang Liu<jiang.liu@huawei.com> >> --- >> include/linux/mm.h | 1 + >> mm/sparse-vmemmap.c | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++ >> mm/sparse.c | 3 +- >> 3 files changed, 234 insertions(+), 1 deletions(-) >> >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index 5657670..1f26af5 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); >> void vmemmap_populate_print_last(void); >> void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, >> unsigned long size); >> +void vmemmap_free(struct page *memmap, unsigned long nr_pages); >> >> enum mf_flags { >> MF_COUNT_INCREASED = 1<< 0, >> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c >> index 1b7e22a..748732d 100644 >> --- a/mm/sparse-vmemmap.c >> +++ b/mm/sparse-vmemmap.c >> @@ -29,6 +29,10 @@ >> #include<asm/pgalloc.h> >> #include<asm/pgtable.h> >> >> +#ifdef CONFIG_MEMORY_HOTREMOVE >> +#include<asm/tlbflush.h> >> +#endif >> + >> /* >> * Allocate a block of memory to be used to back the virtual memory map >> * or to back the page tables that are used to create the mapping. >> @@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, >> vmemmap_buf_end = NULL; >> } >> } >> + >> +#ifdef CONFIG_MEMORY_HOTREMOVE >> + >> +#define PAGE_INUSE 0xFD >> + >> +static void vmemmap_free_pages(struct page *page, int order) >> +{ >> + struct zone *zone; >> + unsigned long magic; >> + >> + magic = (unsigned long) page->lru.next; >> + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { >> + put_page_bootmem(page); >> + >> + zone = page_zone(page); >> + zone_span_writelock(zone); >> + zone->present_pages++; >> + zone_span_writeunlock(zone); >> + totalram_pages++; >> + } else >> + free_pages((unsigned long)page_address(page), order); > > Here, I think SECTION_INFO and MIX_SECTION_INFO pages are all allocated > by bootmem, so I put this function this way. > > I'm not sure if parameter order is necessary here. It will always be 0 > in your code. Is this OK to you ? > parameter order is necessary in cpu_has_pse case: vmemmap_pmd_remove free_pagetable(pmd_page(*pmd), get_order(PMD_SIZE)) > static void free_pagetable(struct page *page) > { > struct zone *zone; > bool bootmem = false; > unsigned long magic; > > /* bootmem page has reserved flag */ > if (PageReserved(page)) { > __ClearPageReserved(page); > bootmem = true; > } > > magic = (unsigned long) page->lru.next; > if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) > put_page_bootmem(page); > else > __free_page(page); > > /* > * SECTION_INFO pages and MIX_SECTION_INFO pages > * are all allocated by bootmem. 
> */ > if (bootmem) { > zone = page_zone(page); > zone_span_writelock(zone); > zone->present_pages++; > zone_span_writeunlock(zone); > totalram_pages++; > } > } > > (snip) > >> + >> +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long end) >> +{ >> + pte_t *pte; >> + unsigned long next; >> + void *page_addr; >> + >> + pte = pte_offset_kernel(pmd, addr); >> + for (; addr< end; pte++, addr += PAGE_SIZE) { >> + next = (addr + PAGE_SIZE)& PAGE_MASK; >> + if (next> end) >> + next = end; >> + >> + if (pte_none(*pte)) > > Here, you checked xxx_none() in your vmemmap_xxx_remove(), but you used > !xxx_present() in your x86_64 patches. Is it OK if I only check > !xxx_present() ? It is OK. > >> + continue; >> + if (IS_ALIGNED(addr, PAGE_SIZE)&& >> + IS_ALIGNED(next, PAGE_SIZE)) { >> + vmemmap_free_pages(pte_page(*pte), 0); >> + spin_lock(&init_mm.page_table_lock); >> + pte_clear(&init_mm, addr, pte); >> + spin_unlock(&init_mm.page_table_lock); >> + } else { >> + /* >> + * Removed page structs are filled with 0xFD. >> + */ >> + memset((void *)addr, PAGE_INUSE, next - addr); >> + page_addr = page_address(pte_page(*pte)); >> + >> + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { >> + spin_lock(&init_mm.page_table_lock); >> + pte_clear(&init_mm, addr, pte); >> + spin_unlock(&init_mm.page_table_lock); > > Here, since we clear pte, we should also free the page, right ? > Right, I forgot here, sorry. >> + } >> + } >> + } >> + >> + free_pte_table(pmd); >> + __flush_tlb_all(); >> +} >> + >> +static void vmemmap_pmd_remove(pud_t *pud, unsigned long addr, unsigned long end) >> +{ >> + unsigned long next; >> + pmd_t *pmd; >> + >> + pmd = pmd_offset(pud, addr); >> + for (; addr< end; addr = next, pmd++) { >> + next = (addr, end); > > And by the way, there isn't pte_addr_end() in kernel, why ? > I saw you calculated it like this: > > next = (addr + PAGE_SIZE) & PAGE_MASK; > if (next > end) > next = end; > > This logic is very similar to {pmd|pud|pgd}_addr_end(). Shall we add a > pte_addr_end() or something ? :) Maybe just keep this for now if no other place needs pte_addr_end()? > Since there is no such code in kernel for a long time, I think there > must be some reasons. Maybe the current kernel just doesn't deal with addresses that are not PTE_SIZE aligned? > > I merged free_xxx_table() and remove_xxx_table() as common interfaces. Great! Thanks for your work :). > > And again, thanks for your patient and nice explanation. :) > > (snip) > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
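For reference, the helper debated above would presumably mirror the existing {pmd,pud,pgd}_addr_end() macros from include/asm-generic/pgtable.h. A hypothetical pte_addr_end(), which the kernel does not currently provide and which is not part of this patch, could look like this; the open-coded calculation in vmemmap_pte_remove() computes the same boundary for the ranges used here:

/* Hypothetical helper, modeled on pmd_addr_end() and friends. */
#define pte_addr_end(addr, end)                                         \
({      unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;    \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
})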
diff --git a/include/linux/mm.h b/include/linux/mm.h index 5657670..1f26af5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long size); +void vmemmap_free(struct page *memmap, unsigned long nr_pages); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1b7e22a..748732d 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -29,6 +29,10 @@ #include <asm/pgalloc.h> #include <asm/pgtable.h> +#ifdef CONFIG_MEMORY_HOTREMOVE +#include <asm/tlbflush.h> +#endif + /* * Allocate a block of memory to be used to back the virtual memory map * or to back the page tables that are used to create the mapping. @@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, vmemmap_buf_end = NULL; } } + +#ifdef CONFIG_MEMORY_HOTREMOVE + +#define PAGE_INUSE 0xFD + +static void vmemmap_free_pages(struct page *page, int order) +{ + struct zone *zone; + unsigned long magic; + + magic = (unsigned long) page->lru.next; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + put_page_bootmem(page); + + zone = page_zone(page); + zone_span_writelock(zone); + zone->present_pages++; + zone_span_writeunlock(zone); + totalram_pages++; + } else + free_pages((unsigned long)page_address(page), order); +} + +static void free_pte_table(pmd_t *pmd) +{ + pte_t *pte, *pte_start; + int i; + + pte_start = (pte_t *)pmd_page_vaddr(*pmd); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (pte_val(*pte)) + return; + } + + /* free a pte talbe */ + vmemmap_free_pages(pmd_page(*pmd), 0); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} + +static void free_pmd_table(pud_t *pud) +{ + pmd_t *pmd, *pmd_start; + int i; + + pmd_start = (pmd_t *)pud_page_vaddr(*pud); + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (pmd_val(*pmd)) + return; + } + + /* free a pmd talbe */ + vmemmap_free_pages(pud_page(*pud), 0); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} + +static void free_pud_table(pgd_t *pgd) +{ + pud_t *pud, *pud_start; + int i; + + pud_start = (pud_t *)pgd_page_vaddr(*pgd); + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (pud_val(*pud)) + return; + } + + /* free a pud table */ + vmemmap_free_pages(pgd_page(*pgd), 0); + spin_lock(&init_mm.page_table_lock); + pgd_clear(pgd); + spin_unlock(&init_mm.page_table_lock); +} + +static int split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase) +{ + struct page *page = pmd_page(*(pmd_t *)kpte); + int i = 0; + unsigned long magic; + unsigned long section_nr; + + __split_large_page(kpte, address, pbase); + __flush_tlb_all(); + + magic = (unsigned long) page->lru.next; + if (magic == SECTION_INFO) { + section_nr = pfn_to_section_nr(page_to_pfn(page)); + while (i < PTRS_PER_PMD) { + page++; + i++; + get_page_bootmem(section_nr, page, SECTION_INFO); + } + } + + return 0; +} + +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + unsigned long next; + void *page_addr; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; pte++, addr += PAGE_SIZE) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (pte_none(*pte)) + continue; + if 
(IS_ALIGNED(addr, PAGE_SIZE) && + IS_ALIGNED(next, PAGE_SIZE)) { + vmemmap_free_pages(pte_page(*pte), 0); + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } else { + /* + * Removed page structs are filled with 0xFD. + */ + memset((void *)addr, PAGE_INUSE, next - addr); + page_addr = page_address(pte_page(*pte)); + + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } + } + } + + free_pte_table(pmd); + __flush_tlb_all(); +} + +static void vmemmap_pmd_remove(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) + continue; + + if (cpu_has_pse) { + unsigned long pte_base; + + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_free_pages(pmd_page(*pmd), + get_order(PMD_SIZE)); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + continue; + } + + /* + * We use 2M page, but we need to remove part of them, + * so split 2M page to 4K page. + */ + pte_base = get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + if (!pte_base) { + WARN_ON(1); + continue; + } + + split_large_page((pte_t *)pmd, addr, (pte_t *)pte_base); + __flush_tlb_all(); + + spin_lock(&init_mm.page_table_lock); + pmd_populate_kernel(&init_mm, pmd, (pte_t *)pte_base); + spin_unlock(&init_mm.page_table_lock); + } + + vmemmap_pte_remove(pmd, addr, next); + } + + free_pmd_table(pud); + __flush_tlb_all(); +} + +static void vmemmap_pud_remove(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + pud_t *pud; + + pud = pud_offset(pgd, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (pud_none(*pud)) + continue; + + vmemmap_pmd_remove(pud, addr, next); + } + + free_pud_table(pgd); + __flush_tlb_all(); +} + +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ + unsigned long addr = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + nr_pages); + unsigned long next; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_offset_k(addr); + + next = pgd_addr_end(addr, end); + if (!pgd_present(*pgd)) + continue; + + vmemmap_pud_remove(pgd, addr, next); + sync_global_pgds(addr, next - 1); + } +} +#endif diff --git a/mm/sparse.c b/mm/sparse.c index fac95f2..4060229 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, } static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { - return; /* XXX: Not implemented yet */ + vmemmap_free(memmap, nr_pages); } static void free_map_bootmem(struct page *page, unsigned long nr_pages) { + vmemmap_free(page, nr_pages); } #else static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
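Taken as a whole, the patch gives sparse-vmemmap a teardown entry point symmetric to vmemmap_populate(). A simplified illustration of how the pair is meant to be used for one memory section follows; the real callers are __kfree_section_memmap() and free_map_bootmem() shown above, so this function is purely hypothetical:

static int example_section_memmap_cycle(struct page *memmap, int nid)
{
    /* hot-add: back PAGES_PER_SECTION struct pages with real memory */
    if (vmemmap_populate(memmap, PAGES_PER_SECTION, nid))
        return -ENOMEM;

    /* ... the section is online and its memmap is in use ... */

    /* hot-remove: free the backing pages and any now-empty page tables */
    vmemmap_free(memmap, PAGES_PER_SECTION);
    return 0;
}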