[v1,15/18] x86: rework domain page allocation

Message ID: 20220706210454.30096-16-dpsmith@apertussolutions.com (mailing list archive)
State: New, archived
Series: Hyperlaunch

Commit Message

Daniel P. Smith July 6, 2022, 9:04 p.m. UTC
This reworks the dom0 page allocation functions to support general domain
construction. Where possible, logic common to the dom0 and general paths
was split out into a separate function for reuse by both.

Signed-off-by: Daniel P. Smith <dpsmith@apertussolutions.com>
Reviewed-by: Christopher Clark <christopher.clark@starlab.io>
---
 xen/arch/x86/dom0_build.c             |  76 ++++++-------------
 xen/arch/x86/domain_builder.c         | 102 ++++++++++++++++++++++++++
 xen/arch/x86/hvm/dom0_build.c         |  11 +--
 xen/arch/x86/include/asm/dom0_build.h |  14 +++-
 xen/arch/x86/pv/dom0_build.c          |   2 +-
 5 files changed, 142 insertions(+), 63 deletions(-)

Comments

Jan Beulich July 27, 2022, 1:22 p.m. UTC | #1
On 06.07.2022 23:04, Daniel P. Smith wrote:
> This reworks the dom0 page allocation functions to support general domain
> construction. Where possible, logic common to the dom0 and general paths
> was split out into a separate function for reuse by both.

You absolutely need to mention what behavioral / functional changes are
intended, even if the answer is "none".

> --- a/xen/arch/x86/dom0_build.c
> +++ b/xen/arch/x86/dom0_build.c
> @@ -320,69 +320,31 @@ static unsigned long __init default_nr_pages(unsigned long avail)
>  }
>  
>  unsigned long __init dom0_compute_nr_pages(
> -    struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
> +    struct boot_domain *bd, struct elf_dom_parms *parms,
> +    unsigned long initrd_len)
>  {
> -    nodeid_t node;
> -    unsigned long avail = 0, nr_pages, min_pages, max_pages, iommu_pages = 0;
> +    unsigned long avail, nr_pages, min_pages, max_pages;
>  
>      /* The ordering of operands is to work around a clang5 issue. */
>      if ( CONFIG_DOM0_MEM[0] && !dom0_mem_set )
>          parse_dom0_mem(CONFIG_DOM0_MEM);
>  
> -    for_each_node_mask ( node, dom0_nodes )
> -        avail += avail_domheap_pages_region(node, 0, 0) +
> -                 initial_images_nrpages(node);
> -
> -    /* Reserve memory for further dom0 vcpu-struct allocations... */
> -    avail -= (d->max_vcpus - 1UL)
> -             << get_order_from_bytes(sizeof(struct vcpu));
> -    /* ...and compat_l4's, if needed. */
> -    if ( is_pv_32bit_domain(d) )
> -        avail -= d->max_vcpus - 1;
> -
> -    /* Reserve memory for iommu_dom0_init() (rough estimate). */
> -    if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
> -    {
> -        unsigned int s;
> -
> -        for ( s = 9; s < BITS_PER_LONG; s += 9 )
> -            iommu_pages += max_pdx >> s;
> -
> -        avail -= iommu_pages;
> -    }
> +    avail = dom_avail_nr_pages(bd, dom0_nodes);
>  
> -    if ( paging_mode_enabled(d) || opt_dom0_shadow || opt_pv_l1tf_hwdom )
> +    /* command line overrides configuration */
> +    if (  dom0_mem_set )

Nit: Stray double blanks.

>      {
> -        unsigned long cpu_pages;
> -
> -        nr_pages = get_memsize(&dom0_size, avail) ?: default_nr_pages(avail);
> -
> -        /*
> -         * Clamp according to min/max limits and available memory
> -         * (preliminary).
> -         */
> -        nr_pages = max(nr_pages, get_memsize(&dom0_min_size, avail));
> -        nr_pages = min(nr_pages, get_memsize(&dom0_max_size, avail));
> -        nr_pages = min(nr_pages, avail);
> -
> -        cpu_pages = dom0_paging_pages(d, nr_pages);
> -
> -        if ( !iommu_use_hap_pt(d) )
> -            avail -= cpu_pages;
> -        else if ( cpu_pages > iommu_pages )
> -            avail -= cpu_pages - iommu_pages;

I can't see any of this represented in the new code. Have you gone through
the history of this code, to understand why things are the way they are,
and hence what (corner) cases need to remain behaviorally unchanged?
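
For reference, the dropped logic performed a two-pass sizing; roughly (a
sketch of the pre-patch flow, using the identifiers from the hunk above):

    /*
     * Pre-patch flow: a preliminary clamp of nr_pages feeds
     * dom0_paging_pages(), whose estimate is then deducted from 'avail'
     * before the final clamp recomputes nr_pages.
     */
    nr_pages = get_memsize(&dom0_size, avail) ?: default_nr_pages(avail);
    nr_pages = max(nr_pages, get_memsize(&dom0_min_size, avail));
    nr_pages = min(nr_pages, get_memsize(&dom0_max_size, avail));
    nr_pages = min(nr_pages, avail);             /* preliminary clamp */

    cpu_pages = dom0_paging_pages(d, nr_pages);  /* estimate based on it */
    if ( !iommu_use_hap_pt(d) )
        avail -= cpu_pages;                      /* separate CPU page tables */
    else if ( cpu_pages > iommu_pages )
        avail -= cpu_pages - iommu_pages;        /* shared with the IOMMU */
    /* ...after which nr_pages is recomputed against the reduced 'avail'. */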

> @@ -40,6 +42,106 @@ struct vcpu *__init alloc_dom_vcpu0(struct boot_domain *bd)
>  }
>  
>  
> +unsigned long __init dom_avail_nr_pages(
> +    struct boot_domain *bd, nodemask_t nodes)
> +{
> +    unsigned long avail = 0, iommu_pages = 0;
> +    bool is_ctldom = false, is_hwdom = false;
> +    unsigned long nr_pages = bd->meminfo.mem_size.nr_pages;
> +    nodeid_t node;
> +
> +    if ( builder_is_ctldom(bd) )
> +        is_ctldom = true;
> +    if ( builder_is_hwdom(bd) )
> +        is_hwdom = true;
> +
> +    for_each_node_mask ( node, nodes )
> +        avail += avail_domheap_pages_region(node, 0, 0) +
> +                 initial_images_nrpages(node);

I don't think this is suitable for anything other than Dom0, so I question
the splitting out and generalizing of this logic. For "ordinary" domains,
the memory size should be well-defined rather than inferred from host
capacity.

Starting from host capacity also means you become ordering-dependent when
it comes to creating (not starting) all the domains: which one is to come
first? And even with this limited to just Dom0: is its size calculated
before or after all the other domains are created?
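
A minimal sketch of the alternative being suggested, with a non-Dom0
domain sized purely from its own configuration (the helper name is
hypothetical; the meminfo fields are the ones this series introduces):

    /*
     * Hypothetical sketch: an "ordinary" domain's allocation comes
     * straight from its boot-time configuration, independent of how
     * much host memory happens to be free when it is created.
     */
    static unsigned long __init domU_nr_pages(const struct boot_domain *bd)
    {
        unsigned long nr_pages = bd->meminfo.mem_size.nr_pages;

        /* Clamp only against the domain's own min/max bounds. */
        nr_pages = max(nr_pages, bd->meminfo.mem_min.nr_pages);
        nr_pages = min(nr_pages, bd->meminfo.mem_max.nr_pages);

        return nr_pages;
    }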

> +    /* Reserve memory for further dom0 vcpu-struct allocations... */

dom0?

> +    avail -= (bd->domain->max_vcpus - 1UL)
> +             << get_order_from_bytes(sizeof(struct vcpu));
> +    /* ...and compat_l4's, if needed. */
> +    if ( is_pv_32bit_domain(bd->domain) )
> +        avail -= bd->domain->max_vcpus - 1;
> +
> +    /* Reserve memory for iommu_dom0_init() (rough estimate). */
> +    if ( is_hwdom && is_iommu_enabled(bd->domain) && !iommu_hwdom_passthrough )

Again the question: why would this be Dom0-only?
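
(For context, the loop in question estimates the IOMMU page-table
overhead of mapping max_pdx frames, with 9 address bits, i.e. 512
entries, resolved per table page at each level:)

    /*
     * Rough IOMMU page-table cost of mapping max_pdx frames:
     *   leaf tables:    max_pdx >> 9
     *   next level up:  max_pdx >> 18
     *   ...summed while the shift stays below BITS_PER_LONG.
     */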

> +    {
> +        unsigned int s;
> +
> +        for ( s = 9; s < BITS_PER_LONG; s += 9 )
> +            iommu_pages += max_pdx >> s;
> +
> +        avail -= iommu_pages;
> +    }
> +
> +    if ( paging_mode_enabled(bd->domain) ||
> +         (is_ctldom && opt_dom0_shadow) ||
> +         (is_hwdom && opt_pv_l1tf_hwdom) )

An interesting combination of conditions. It (again) looks to me as if
this first needs a proper separation of Dom0 from hwdom, in an abstract
sense.
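
One possible shape for that separation, as a sketch only (the predicate
name is hypothetical; the role checks are the series' own
builder_is_ctldom() / builder_is_hwdom()):

    /*
     * Sketch: test each role-specific option only against a domain
     * actually holding that role, rather than in one combined condition.
     */
    static bool __init needs_paging_pool(const struct boot_domain *bd)
    {
        if ( paging_mode_enabled(bd->domain) )
            return true;                 /* any translated domain */
        if ( builder_is_ctldom(bd) && opt_dom0_shadow )
            return true;                 /* control domain shadow mode */
        if ( builder_is_hwdom(bd) && opt_pv_l1tf_hwdom )
            return true;                 /* hwdom L1TF mitigation */
        return false;
    }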

> +    {
> +        unsigned long cpu_pages = dom0_paging_pages(bd->domain, nr_pages);
> +
> +        if ( !iommu_use_hap_pt(bd->domain) )
> +            avail -= cpu_pages;
> +        else if ( cpu_pages > iommu_pages )
> +            avail -= cpu_pages - iommu_pages;
> +    }
> +
> +    return avail;
> +}
> +
> +unsigned long __init dom_compute_nr_pages(
> +    struct boot_domain *bd, struct elf_dom_parms *parms,
> +    unsigned long initrd_len)
> +{
> +    unsigned long avail, nr_pages = bd->meminfo.mem_size.nr_pages;
> +
> +    if ( builder_is_initdom(bd) )
> +        return dom0_compute_nr_pages(bd, parms, initrd_len);
> +
> +    avail = dom_avail_nr_pages(bd, node_online_map);
> +
> +    if ( is_pv_domain(bd->domain) && (parms->p2m_base == UNSET_ADDR) )
> +    {
> +        /*
> +         * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M
> +         * note) require that there is enough virtual space beyond the initial
> +         * allocation to set up their initial page tables. This space is
> +         * roughly the same size as the p2m table, so make sure the initial
> +         * allocation doesn't consume more than about half the space that's
> +         * available between params.virt_base and the address space end.
> +         */

This duplicates an existing comment (and, below, likely also existing
code) rather than replacing / moving the original. As in an earlier case:
how are the two going to remain in sync?
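
One way to keep them in sync would be a single helper holding both the
comment and the clipping logic, called from dom0_compute_nr_pages() and
dom_compute_nr_pages() alike. A sketch (name and signature hypothetical;
the body is the logic duplicated below):

    /* Hypothetical single home for the legacy-kernel p2m headroom clip. */
    static unsigned long __init clip_for_legacy_p2m(
        const struct boot_domain *bd, const struct elf_dom_parms *parms,
        unsigned long initrd_len, unsigned long nr_pages)
    {
        size_t sizeof_long = is_pv_32bit_domain(bd->domain) ? sizeof(int)
                                                            : sizeof(long);
        unsigned long vstart = parms->virt_base;
        unsigned long vend = round_pgup(parms->virt_kend);
        unsigned long end;

        if ( !parms->unmapped_initrd )
            vend += round_pgup(initrd_len);
        end = vend + nr_pages * sizeof_long;

        /* Demand as much VA headroom beyond the image as it consumes. */
        if ( end > vstart )
            end += end - vstart;
        if ( end <= vstart ||
             (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) )
        {
            end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long);
            nr_pages = (end - vend) / (2 * sizeof_long);
        }

        return nr_pages;
    }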

Jan

Patch

diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c
index 216c9e3590..0600773b8f 100644
--- a/xen/arch/x86/dom0_build.c
+++ b/xen/arch/x86/dom0_build.c
@@ -320,69 +320,31 @@  static unsigned long __init default_nr_pages(unsigned long avail)
 }
 
 unsigned long __init dom0_compute_nr_pages(
-    struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
+    struct boot_domain *bd, struct elf_dom_parms *parms,
+    unsigned long initrd_len)
 {
-    nodeid_t node;
-    unsigned long avail = 0, nr_pages, min_pages, max_pages, iommu_pages = 0;
+    unsigned long avail, nr_pages, min_pages, max_pages;
 
     /* The ordering of operands is to work around a clang5 issue. */
     if ( CONFIG_DOM0_MEM[0] && !dom0_mem_set )
         parse_dom0_mem(CONFIG_DOM0_MEM);
 
-    for_each_node_mask ( node, dom0_nodes )
-        avail += avail_domheap_pages_region(node, 0, 0) +
-                 initial_images_nrpages(node);
-
-    /* Reserve memory for further dom0 vcpu-struct allocations... */
-    avail -= (d->max_vcpus - 1UL)
-             << get_order_from_bytes(sizeof(struct vcpu));
-    /* ...and compat_l4's, if needed. */
-    if ( is_pv_32bit_domain(d) )
-        avail -= d->max_vcpus - 1;
-
-    /* Reserve memory for iommu_dom0_init() (rough estimate). */
-    if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
-    {
-        unsigned int s;
-
-        for ( s = 9; s < BITS_PER_LONG; s += 9 )
-            iommu_pages += max_pdx >> s;
-
-        avail -= iommu_pages;
-    }
+    avail = dom_avail_nr_pages(bd, dom0_nodes);
 
-    if ( paging_mode_enabled(d) || opt_dom0_shadow || opt_pv_l1tf_hwdom )
+    /* command line overrides configuration */
+    if (  dom0_mem_set )
     {
-        unsigned long cpu_pages;
-
-        nr_pages = get_memsize(&dom0_size, avail) ?: default_nr_pages(avail);
-
-        /*
-         * Clamp according to min/max limits and available memory
-         * (preliminary).
-         */
-        nr_pages = max(nr_pages, get_memsize(&dom0_min_size, avail));
-        nr_pages = min(nr_pages, get_memsize(&dom0_max_size, avail));
-        nr_pages = min(nr_pages, avail);
-
-        cpu_pages = dom0_paging_pages(d, nr_pages);
-
-        if ( !iommu_use_hap_pt(d) )
-            avail -= cpu_pages;
-        else if ( cpu_pages > iommu_pages )
-            avail -= cpu_pages - iommu_pages;
+        bd->meminfo.mem_size = dom0_size;
+        bd->meminfo.mem_min = dom0_min_size;
+        bd->meminfo.mem_max = dom0_max_size;
     }
 
-    nr_pages = get_memsize(&dom0_size, avail) ?: default_nr_pages(avail);
-    min_pages = get_memsize(&dom0_min_size, avail);
-    max_pages = get_memsize(&dom0_max_size, avail);
-
-    /* Clamp according to min/max limits and available memory (final). */
-    nr_pages = max(nr_pages, min_pages);
-    nr_pages = min(nr_pages, max_pages);
-    nr_pages = min(nr_pages, avail);
+    nr_pages = get_memsize(&bd->meminfo.mem_size, avail) ?
+               : default_nr_pages(avail);
+    min_pages = get_memsize(&bd->meminfo.mem_min, avail);
+    max_pages = get_memsize(&bd->meminfo.mem_max, avail);
 
-    if ( is_pv_domain(d) &&
+    if ( is_pv_domain(bd->domain) &&
          (parms->p2m_base == UNSET_ADDR) && !memsize_gt_zero(&dom0_size) &&
          (!memsize_gt_zero(&dom0_min_size) || (nr_pages > min_pages)) )
     {
@@ -395,7 +357,8 @@  unsigned long __init dom0_compute_nr_pages(
          * available between params.virt_base and the address space end.
          */
         unsigned long vstart, vend, end;
-        size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int) : sizeof(long);
+        size_t sizeof_long = is_pv_32bit_domain(bd->domain) ?
+                             sizeof(int) : sizeof(long);
 
         vstart = parms->virt_base;
         vend = round_pgup(parms->virt_kend);
@@ -416,7 +379,12 @@  unsigned long __init dom0_compute_nr_pages(
         }
     }
 
-    d->max_pages = min_t(unsigned long, max_pages, UINT_MAX);
+    /* Clamp according to min/max limits and available memory (final). */
+    nr_pages = max(nr_pages, min_pages);
+    nr_pages = min(nr_pages, max_pages);
+    nr_pages = min(nr_pages, avail);
+
+    bd->domain->max_pages = min_t(unsigned long, max_pages, UINT_MAX);
 
     return nr_pages;
 }
diff --git a/xen/arch/x86/domain_builder.c b/xen/arch/x86/domain_builder.c
index 1a4a6b1ca7..d8babb1090 100644
--- a/xen/arch/x86/domain_builder.c
+++ b/xen/arch/x86/domain_builder.c
@@ -8,7 +8,9 @@ 
 #include <xen/sched.h>
 
 #include <asm/pv/shim.h>
+#include <asm/dom0_build.h>
 #include <asm/setup.h>
+#include <asm/spec_ctrl.h>
 
 extern unsigned long cr4_pv32_mask;
 
@@ -40,6 +42,106 @@  struct vcpu *__init alloc_dom_vcpu0(struct boot_domain *bd)
 }
 
 
+unsigned long __init dom_avail_nr_pages(
+    struct boot_domain *bd, nodemask_t nodes)
+{
+    unsigned long avail = 0, iommu_pages = 0;
+    bool is_ctldom = false, is_hwdom = false;
+    unsigned long nr_pages = bd->meminfo.mem_size.nr_pages;
+    nodeid_t node;
+
+    if ( builder_is_ctldom(bd) )
+        is_ctldom = true;
+    if ( builder_is_hwdom(bd) )
+        is_hwdom = true;
+
+    for_each_node_mask ( node, nodes )
+        avail += avail_domheap_pages_region(node, 0, 0) +
+                 initial_images_nrpages(node);
+
+    /* Reserve memory for further dom0 vcpu-struct allocations... */
+    avail -= (bd->domain->max_vcpus - 1UL)
+             << get_order_from_bytes(sizeof(struct vcpu));
+    /* ...and compat_l4's, if needed. */
+    if ( is_pv_32bit_domain(bd->domain) )
+        avail -= bd->domain->max_vcpus - 1;
+
+    /* Reserve memory for iommu_dom0_init() (rough estimate). */
+    if ( is_hwdom && is_iommu_enabled(bd->domain) && !iommu_hwdom_passthrough )
+    {
+        unsigned int s;
+
+        for ( s = 9; s < BITS_PER_LONG; s += 9 )
+            iommu_pages += max_pdx >> s;
+
+        avail -= iommu_pages;
+    }
+
+    if ( paging_mode_enabled(bd->domain) ||
+         (is_ctldom && opt_dom0_shadow) ||
+         (is_hwdom && opt_pv_l1tf_hwdom) )
+    {
+        unsigned long cpu_pages = dom0_paging_pages(bd->domain, nr_pages);
+
+        if ( !iommu_use_hap_pt(bd->domain) )
+            avail -= cpu_pages;
+        else if ( cpu_pages > iommu_pages )
+            avail -= cpu_pages - iommu_pages;
+    }
+
+    return avail;
+}
+
+unsigned long __init dom_compute_nr_pages(
+    struct boot_domain *bd, struct elf_dom_parms *parms,
+    unsigned long initrd_len)
+{
+    unsigned long avail, nr_pages = bd->meminfo.mem_size.nr_pages;
+
+    if ( builder_is_initdom(bd) )
+        return dom0_compute_nr_pages(bd, parms, initrd_len);
+
+    avail = dom_avail_nr_pages(bd, node_online_map);
+
+    if ( is_pv_domain(bd->domain) && (parms->p2m_base == UNSET_ADDR) )
+    {
+        /*
+         * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M
+         * note) require that there is enough virtual space beyond the initial
+         * allocation to set up their initial page tables. This space is
+         * roughly the same size as the p2m table, so make sure the initial
+         * allocation doesn't consume more than about half the space that's
+         * available between params.virt_base and the address space end.
+         */
+        unsigned long vstart, vend, end;
+        size_t sizeof_long = is_pv_32bit_domain(bd->domain) ?
+                             sizeof(int) : sizeof(long);
+
+        vstart = parms->virt_base;
+        vend = round_pgup(parms->virt_kend);
+        if ( !parms->unmapped_initrd )
+            vend += round_pgup(initrd_len);
+        end = vend + nr_pages * sizeof_long;
+
+        if ( end > vstart )
+            end += end - vstart;
+        if ( end <= vstart ||
+             (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) )
+        {
+            end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long);
+            nr_pages = (end - vend) / (2 * sizeof_long);
+            printk("Dom0 memory clipped to %lu pages\n", nr_pages);
+        }
+    }
+
+    /* Clamp according to available memory (final). */
+    nr_pages = min(nr_pages, avail);
+
+    bd->domain->max_pages = min_t(unsigned long, nr_pages, UINT_MAX);
+
+    return nr_pages;
+}
+
 void __init arch_create_dom(
     const struct boot_info *bi, struct boot_domain *bd)
 {
diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index ae3ffc614d..e94392be07 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -412,15 +412,16 @@  static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
     ASSERT(cur_pages == nr_pages);
 }
 
-static void __init pvh_init_p2m(struct domain *d)
+static void __init pvh_init_p2m(struct boot_domain *bd)
 {
-    unsigned long nr_pages = dom0_compute_nr_pages(d, NULL, 0);
+    unsigned long nr_pages = dom_compute_nr_pages(bd, NULL, 0);
     bool preempted;
 
-    pvh_setup_e820(d, nr_pages);
+    pvh_setup_e820(bd->domain, nr_pages);
     do {
         preempted = false;
-        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
+        paging_set_allocation(bd->domain,
+                              dom0_paging_pages(bd->domain, nr_pages),
                               &preempted);
         process_pending_softirqs();
     } while ( preempted );
@@ -1239,7 +1240,7 @@  int __init dom0_construct_pvh(struct boot_domain *bd)
      * be done before the iommu initializion, since iommu initialization code
      * will likely add mappings required by devices to the p2m (ie: RMRRs).
      */
-    pvh_init_p2m(d);
+    pvh_init_p2m(bd);
 
     iommu_hwdom_init(d);
 
diff --git a/xen/arch/x86/include/asm/dom0_build.h b/xen/arch/x86/include/asm/dom0_build.h
index f30e4b860a..6c26ab0878 100644
--- a/xen/arch/x86/include/asm/dom0_build.h
+++ b/xen/arch/x86/include/asm/dom0_build.h
@@ -9,9 +9,17 @@ 
 
 extern unsigned int dom0_memflags;
 
-unsigned long dom0_compute_nr_pages(struct domain *d,
-                                    struct elf_dom_parms *parms,
-                                    unsigned long initrd_len);
+unsigned long dom_avail_nr_pages(
+    struct boot_domain *bd, nodemask_t nodes);
+
+unsigned long dom0_compute_nr_pages(
+    struct boot_domain *bd, struct elf_dom_parms *parms,
+    unsigned long initrd_len);
+
+unsigned long dom_compute_nr_pages(
+    struct boot_domain *bd, struct elf_dom_parms *parms,
+    unsigned long initrd_len);
+
 int dom0_setup_permissions(struct domain *d);
 
 int dom0_construct_pv(struct boot_domain *bd);
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 9d1c9fb8b0..ff5c93fa14 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -428,7 +428,7 @@  int __init dom0_construct_pv(struct boot_domain *bd)
         }
     }
 
-    nr_pages = dom0_compute_nr_pages(d, &parms, initrd_len);
+    nr_pages = dom_compute_nr_pages(bd, &parms, initrd_len);
 
     if ( parms.pae == XEN_PAE_EXTCR3 )
             set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);