diff mbox series

[3/3] x86/64/kexec: Rewrite init_transition_pgtable() with kernel_ident_mapping_init()

Message ID 20240701124334.1855981-4-kirill.shutemov@linux.intel.com (mailing list archive)
State Handled Elsewhere, archived
Headers show
Series x86: Reduce code duplication on page table initialization | expand

Commit Message

Kirill A . Shutemov July 1, 2024, 12:43 p.m. UTC
init_transition_pgtable() setups transitional page tables. Rewrite it
using kernel_ident_mapping_init() to avoid code duplication.

struct kimage_arch changed to track allocated page tables as a list, not
linking them to specific page table levels.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/include/asm/kexec.h       |  5 +-
 arch/x86/kernel/machine_kexec_64.c | 89 +++++++++++-------------------
 2 files changed, 32 insertions(+), 62 deletions(-)

Comments

Huang, Kai July 3, 2024, 11:06 a.m. UTC | #1
On Mon, 2024-07-01 at 15:43 +0300, Kirill A. Shutemov wrote:
> init_transition_pgtable() setups transitional page tables. Rewrite it
> using kernel_ident_mapping_init() to avoid code duplication.

setups -> sets up

> 
> struct kimage_arch changed to track allocated page tables as a list, not
> linking them to specific page table levels.

This doesn't look like imperative mode.  Perhaps change to:

  Change struct kimage_arch to track ...

[...]


>  static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
>  {
> -	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
> -	unsigned long vaddr, paddr;
> -	int result = -ENOMEM;
> -	p4d_t *p4d;
> -	pud_t *pud;
> -	pmd_t *pmd;
> -	pte_t *pte;
> +	struct x86_mapping_info info = {
> +		.alloc_pgt_page	= alloc_transition_pgt_page,
> +		.context	= image,
> +		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
> +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
> +		.offset = __START_KERNEL_map - phys_base,
> +	};
> +	unsigned long mstart = PAGE_ALIGN_DOWN(__pa(relocate_kernel));
> +	unsigned long mend = mstart + PAGE_SIZE;
>  
> -	vaddr = (unsigned long)relocate_kernel;
> -	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);

Perhaps I am missing something, but this seems like a functional change to me.

IIUC the page after image->control_code_page is allocated when loading the
kexec kernel image.  It is a different page from the page where the
relocate_kernel code resides in.

The old code maps relocate_kernel kernel VA to the page after the
control_code_page.  Later in machine_kexec(), the relocate_kernel code is
copied to that page so the mapping can work for that:

	control_page = page_address(image->control_code_page) + PAGE_SIZE;
	__memcpy(control_page, relocate_kernel,
KEXEC_CONTROL_CODE_MAX_SIZE);

The new code in this patch, however, seems to just map the relocate_kernel VA
to the PA of relocate_kernel, which is different from the old
mapping.
Kirill A . Shutemov July 4, 2024, 1:44 p.m. UTC | #2
On Wed, Jul 03, 2024 at 11:06:21AM +0000, Huang, Kai wrote:
> >  static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
> >  {
> > -	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
> > -	unsigned long vaddr, paddr;
> > -	int result = -ENOMEM;
> > -	p4d_t *p4d;
> > -	pud_t *pud;
> > -	pmd_t *pmd;
> > -	pte_t *pte;
> > +	struct x86_mapping_info info = {
> > +		.alloc_pgt_page	= alloc_transition_pgt_page,
> > +		.context	= image,
> > +		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
> > +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
> > +		.offset = __START_KERNEL_map - phys_base,
> > +	};
> > +	unsigned long mstart = PAGE_ALIGN_DOWN(__pa(relocate_kernel));
> > +	unsigned long mend = mstart + PAGE_SIZE;
> >  
> > -	vaddr = (unsigned long)relocate_kernel;
> > -	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
> 
> Perhaps I am missing something, but this seems a functional change to me.
> 
> IIUC the page after image->control_code_page is allocated when loading the
> kexec kernel image.  It is a different page from the page where the
> relocate_kernel code resides in.
> 
> The old code maps relocate_kernel kernel VA to the page after the
> control_code_page.  Later in machine_kexec(), the relocate_kernel code is
> copied to that page so the mapping can work for that:
> 
> 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
> 	__memcpy(control_page, relocate_kernel,
> KEXEC_CONTROL_CODE_MAX_SIZE);
> 
> The new code in this patch, however, seems just maps the relocate_kernel VA
> to the PA of the relocate_kernel, which should be different from the old
> mapping.

Yes, the original code maps the page containing a copy of relocate_kernel()
(in control_code_page) at the relocate_kernel() VA. But it is safe to map the
original relocate_kernel() page there as well, as it is not going to be
overwritten until swap_pages(). We are not going to use the original
relocate_kernel() page after the RET at the end of relocate_kernel().

Does that make sense?

I will try to explain it in the commit message in the next version.
Huang, Kai July 5, 2024, 10:35 a.m. UTC | #3
On Thu, 2024-07-04 at 16:44 +0300, kirill.shutemov@linux.intel.com wrote:
> On Wed, Jul 03, 2024 at 11:06:21AM +0000, Huang, Kai wrote:
> > >  static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
> > >  {
> > > -	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
> > > -	unsigned long vaddr, paddr;
> > > -	int result = -ENOMEM;
> > > -	p4d_t *p4d;
> > > -	pud_t *pud;
> > > -	pmd_t *pmd;
> > > -	pte_t *pte;
> > > +	struct x86_mapping_info info = {
> > > +		.alloc_pgt_page	= alloc_transition_pgt_page,
> > > +		.context	= image,
> > > +		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
> > > +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
> > > +		.offset = __START_KERNEL_map - phys_base,
> > > +	};
> > > +	unsigned long mstart = PAGE_ALIGN_DOWN(__pa(relocate_kernel));
> > > +	unsigned long mend = mstart + PAGE_SIZE;
> > >  
> > > -	vaddr = (unsigned long)relocate_kernel;
> > > -	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
> > 
> > Perhaps I am missing something, but this seems a functional change to me.
> > 
> > IIUC the page after image->control_code_page is allocated when loading the
> > kexec kernel image.  It is a different page from the page where the
> > relocate_kernel code resides in.
> > 
> > The old code maps relocate_kernel kernel VA to the page after the
> > control_code_page.  Later in machine_kexec(), the relocate_kernel code is
> > copied to that page so the mapping can work for that:
> > 
> > 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
> > 	__memcpy(control_page, relocate_kernel,
> > KEXEC_CONTROL_CODE_MAX_SIZE);
> > 
> > The new code in this patch, however, seems just maps the relocate_kernel VA
> > to the PA of the relocate_kernel, which should be different from the old
> > mapping.
> 
> Yes, original code maps at relocate_kernel() VA the page with copy of the
> relocate_kernel() in control_code_page. But it is safe to map original
> relocate_kernel() page there as well as it is not going to be overwritten
> until swap_pages(). We are not going to use original relocate_kernel()
> page after RET at the end of relocate_kernel().

I am not super familiar with this, but this doesn't seem 100% safe to me.

E.g., did you consider the kexec jump case?

The second half of the control page is also used to store registers in kexec
jump.  If the relocate_kernel VA isn't mapped to the control page, then IIUC
after jumping back to the old kernel it seems we won't be able to read those
registers back?

> 
> Does it make any sense?
> 
> I will try to explain it in the commit message in the next version.
> 

I think even if it's safe to change the mapping to the relocate_kernel() page, it
should be done in a separate patch.  This patch should just focus on removing
the duplicated page table setup code.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index ae5482a2f0ca..7f9287f371e6 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -145,10 +145,7 @@  struct kimage_arch {
 };
 #else
 struct kimage_arch {
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	struct list_head pages;
 };
 #endif /* CONFIG_X86_32 */
 
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index cc0f7f70b17b..951b17d217ab 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -107,71 +107,42 @@  map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
 	return 0;
 }
 
+static void *alloc_transition_pgt_page(void *data)
+{
+	struct kimage *image = (struct kimage *)data;
+	unsigned long virt;
+
+	virt = get_zeroed_page(GFP_KERNEL);
+	if (!virt)
+		return NULL;
+
+	list_add(&virt_to_page(virt)->lru, &image->arch.pages);
+	return (void *)virt;
+}
+
 static void free_transition_pgtable(struct kimage *image)
 {
-	free_page((unsigned long)image->arch.p4d);
-	image->arch.p4d = NULL;
-	free_page((unsigned long)image->arch.pud);
-	image->arch.pud = NULL;
-	free_page((unsigned long)image->arch.pmd);
-	image->arch.pmd = NULL;
-	free_page((unsigned long)image->arch.pte);
-	image->arch.pte = NULL;
+	struct page *page, *tmp;
+
+	list_for_each_entry_safe(page, tmp, &image->arch.pages, lru) {
+		list_del(&page->lru);
+		free_page((unsigned long)page_address(page));
+	}
 }
 
 static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 {
-	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
-	unsigned long vaddr, paddr;
-	int result = -ENOMEM;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	struct x86_mapping_info info = {
+		.alloc_pgt_page	= alloc_transition_pgt_page,
+		.context	= image,
+		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
+		.kernpg_flag	= _KERNPG_TABLE_NOENC,
+		.offset = __START_KERNEL_map - phys_base,
+	};
+	unsigned long mstart = PAGE_ALIGN_DOWN(__pa(relocate_kernel));
+	unsigned long mend = mstart + PAGE_SIZE;
 
-	vaddr = (unsigned long)relocate_kernel;
-	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
-	pgd += pgd_index(vaddr);
-	if (!pgd_present(*pgd)) {
-		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
-		if (!p4d)
-			goto err;
-		image->arch.p4d = p4d;
-		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
-	}
-	p4d = p4d_offset(pgd, vaddr);
-	if (!p4d_present(*p4d)) {
-		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
-		if (!pud)
-			goto err;
-		image->arch.pud = pud;
-		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
-	}
-	pud = pud_offset(p4d, vaddr);
-	if (!pud_present(*pud)) {
-		pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
-		if (!pmd)
-			goto err;
-		image->arch.pmd = pmd;
-		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
-	}
-	pmd = pmd_offset(pud, vaddr);
-	if (!pmd_present(*pmd)) {
-		pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
-		if (!pte)
-			goto err;
-		image->arch.pte = pte;
-		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
-	}
-	pte = pte_offset_kernel(pmd, vaddr);
-
-	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
-		prot = PAGE_KERNEL_EXEC;
-
-	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
-	return 0;
-err:
-	return result;
+	return kernel_ident_mapping_init(&info, pgd, mstart, mend);
 }
 
 static void *alloc_pgt_page(void *data)
@@ -272,6 +243,8 @@  int machine_kexec_prepare(struct kimage *image)
 	unsigned long start_pgtable;
 	int result;
 
+	INIT_LIST_HEAD(&image->arch.pages);
+
 	/* Calculate the offsets */
 	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;