
[v3,4/5] x86: split PVH dom0 builder to hvm/dom0_build.c

Message ID 20170320141426.20780-5-wei.liu2@citrix.com (mailing list archive)
State New, archived

Commit Message

Wei Liu March 20, 2017, 2:14 p.m. UTC
Long term we want to be able to disentangle PV and HVM code. Move
the PVH domain builder to a dedicated file.

Lift function declarations to dom0_build.h and rename them when
necessary.

No functional change.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
---
v3:
1. adapt to previous patch

v2:
1. add copyright header
---
 xen/arch/x86/dom0_build.c        | 1071 +-----------------------------------
 xen/arch/x86/hvm/Makefile        |    1 +
 xen/arch/x86/hvm/dom0_build.c    | 1114 ++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/dom0_build.h |    9 +
 4 files changed, 1127 insertions(+), 1068 deletions(-)
 create mode 100644 xen/arch/x86/hvm/dom0_build.c
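
Note: the xen/include/asm-x86/dom0_build.h hunk falls outside this excerpt. Judging
from the renames visible in the C files (dom0_paging_pages losing its static
qualifier, construct_dom0_pvh becoming dom0_construct_pvh), the lifted declarations
presumably look roughly like the following sketch, not the actual hunk:

    /* Formerly file-local helpers, now shared between the PV and PVH builders. */
    unsigned long dom0_paging_pages(const struct domain *d,
                                    unsigned long nr_pages);
    int dom0_construct_pvh(struct domain *d, const module_t *image,
                           unsigned long image_headroom, module_t *initrd,
                           void *(*bootstrap_map)(const module_t *),
                           char *cmdline);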

Comments

Andrew Cooper March 20, 2017, 2:51 p.m. UTC | #1
On 20/03/17 14:14, Wei Liu wrote:
> Long term we want to be able to disentangle PV and HVM code. Move
> the PVH domain builder to a dedicated file.
>
> Lift function declarations to dom0_build.h and rename them when
> necessary.
>
> No functional change.
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> Acked-by: Roger Pau Monné <roger.pau@citrix.com>

One style nit.

> +    /* Copy the addresses of the rest of the allowed tables. */
> +    for( i = 0, j = 1; i < acpi_gbl_root_table_list.count; i++ )
> +    {
> +        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
> +
> +        if ( pvh_acpi_table_allowed(sig) )
> +            xsdt->table_offset_entry[j++] =
> +                                acpi_gbl_root_table_list.tables[i].address;

Indentation.

Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
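
For context, the nit concerns the alignment of the continuation line quoted above;
a plausible fix (purely illustrative, not part of the posted patch) would indent it
one level past the assignment instead of right-aligning it:

        if ( pvh_acpi_table_allowed(sig) )
            xsdt->table_offset_entry[j++] =
                acpi_gbl_root_table_list.tables[i].address;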

Patch

diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c
index cb44a20792..9dc2f803ae 100644
--- a/xen/arch/x86/dom0_build.c
+++ b/xen/arch/x86/dom0_build.c
@@ -50,27 +50,6 @@  static long __initdata dom0_nrpages;
 static long __initdata dom0_min_nrpages;
 static long __initdata dom0_max_nrpages = LONG_MAX;
 
-/*
- * Have the TSS cover the ISA port range, which makes it
- * - 104 bytes base structure
- * - 32 bytes interrupt redirection bitmap
- * - 128 bytes I/O bitmap
- * - one trailing byte
- * or a total of 265 bytes.
- *
- * NB: as PVHv2 Dom0 doesn't have legacy devices (ISA), it shouldn't have any
- * business in accessing the ISA port range, much less in real mode, and due to
- * the lack of firmware it shouldn't also execute any INT instruction. This is
- * done just for consistency with what hvmloader does.
- */
-#define HVM_VM86_TSS_SIZE 265
-
-static unsigned int __initdata acpi_intr_overrides;
-static struct acpi_madt_interrupt_override __initdata *intsrcovr;
-
-static unsigned int __initdata acpi_nmi_sources;
-static struct acpi_madt_nmi_source __initdata *nmisrc;
-
 /*
  * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
  * 
@@ -263,8 +242,8 @@  boolean_param("ro-hpet", ro_hpet);
 
 unsigned int __initdata dom0_memflags = MEMF_no_dma|MEMF_exact_node;
 
-static unsigned long __init dom0_paging_pages(const struct domain *d,
-                                              unsigned long nr_pages)
+unsigned long __init dom0_paging_pages(const struct domain *d,
+                                       unsigned long nr_pages)
 {
     /* Copied from: libxl_get_required_shadow_memory() */
     unsigned long memkb = nr_pages * (PAGE_SIZE / 1024);
@@ -493,1050 +472,6 @@  int __init dom0_setup_permissions(struct domain *d)
     return rc;
 }
 
-static int __init modify_identity_mmio(struct domain *d, unsigned long pfn,
-                                       unsigned long nr_pages, const bool map)
-{
-    int rc;
-
-    for ( ; ; )
-    {
-        rc = (map ? map_mmio_regions : unmap_mmio_regions)
-             (d, _gfn(pfn), nr_pages, _mfn(pfn));
-        if ( rc == 0 )
-            break;
-        if ( rc < 0 )
-        {
-            printk(XENLOG_WARNING
-                   "Failed to identity %smap [%#lx,%#lx) for d%d: %d\n",
-                   map ? "" : "un", pfn, pfn + nr_pages, d->domain_id, rc);
-            break;
-        }
-        nr_pages -= rc;
-        pfn += rc;
-        process_pending_softirqs();
-    }
-
-    return rc;
-}
-
-/* Populate a HVM memory range using the biggest possible order. */
-static int __init pvh_populate_memory_range(struct domain *d,
-                                            unsigned long start,
-                                            unsigned long nr_pages)
-{
-    unsigned int order, i = 0;
-    struct page_info *page;
-    int rc;
-#define MAP_MAX_ITER 64
-
-    order = MAX_ORDER;
-    while ( nr_pages != 0 )
-    {
-        unsigned int range_order = get_order_from_pages(nr_pages + 1);
-
-        order = min(range_order ? range_order - 1 : 0, order);
-        page = alloc_domheap_pages(d, order, dom0_memflags);
-        if ( page == NULL )
-        {
-            if ( order == 0 && dom0_memflags )
-            {
-                /* Try again without any dom0_memflags. */
-                dom0_memflags = 0;
-                order = MAX_ORDER;
-                continue;
-            }
-            if ( order == 0 )
-            {
-                printk("Unable to allocate memory with order 0!\n");
-                return -ENOMEM;
-            }
-            order--;
-            continue;
-        }
-
-        rc = guest_physmap_add_page(d, _gfn(start), _mfn(page_to_mfn(page)),
-                                    order);
-        if ( rc != 0 )
-        {
-            printk("Failed to populate memory: [%#lx,%lx): %d\n",
-                   start, start + (1UL << order), rc);
-            return -ENOMEM;
-        }
-        start += 1UL << order;
-        nr_pages -= 1UL << order;
-        if ( (++i % MAP_MAX_ITER) == 0 )
-            process_pending_softirqs();
-    }
-
-    return 0;
-#undef MAP_MAX_ITER
-}
-
-/* Steal RAM from the end of a memory region. */
-static int __init pvh_steal_ram(struct domain *d, unsigned long size,
-                                unsigned long align, paddr_t limit,
-                                paddr_t *addr)
-{
-    unsigned int i = d->arch.nr_e820;
-
-    /*
-     * Alignment 0 should be set to 1, so it doesn't wrap around in the
-     * calculations below.
-     */
-    align = align ? : 1;
-    while ( i-- )
-    {
-        struct e820entry *entry = &d->arch.e820[i];
-
-        if ( entry->type != E820_RAM || entry->addr + entry->size > limit ||
-             entry->addr < MB(1) )
-            continue;
-
-        *addr = (entry->addr + entry->size - size) & ~(align - 1);
-        if ( *addr < entry->addr )
-            continue;
-
-        entry->size = *addr - entry->addr;
-        return 0;
-    }
-
-    return -ENOMEM;
-}
-
-/* NB: memory map must be sorted at all times for this to work correctly. */
-static int __init pvh_add_mem_range(struct domain *d, uint64_t s, uint64_t e,
-                                    unsigned int type)
-{
-    struct e820entry *map;
-    unsigned int i;
-
-    for ( i = 0; i < d->arch.nr_e820; i++ )
-    {
-        uint64_t rs = d->arch.e820[i].addr;
-        uint64_t re = rs + d->arch.e820[i].size;
-
-        if ( rs == e && d->arch.e820[i].type == type )
-        {
-            d->arch.e820[i].addr = s;
-            return 0;
-        }
-
-        if ( re == s && d->arch.e820[i].type == type &&
-             (i + 1 == d->arch.nr_e820 || d->arch.e820[i + 1].addr >= e) )
-        {
-            d->arch.e820[i].size += e - s;
-            return 0;
-        }
-
-        if ( rs >= e )
-            break;
-
-        if ( re > s )
-            return -EEXIST;
-    }
-
-    map = xzalloc_array(struct e820entry, d->arch.nr_e820 + 1);
-    if ( !map )
-    {
-        printk(XENLOG_WARNING "E820: out of memory to add region\n");
-        return -ENOMEM;
-    }
-
-    memcpy(map, d->arch.e820, i * sizeof(*d->arch.e820));
-    memcpy(map + i + 1, d->arch.e820 + i,
-           (d->arch.nr_e820 - i) * sizeof(*d->arch.e820));
-    map[i].addr = s;
-    map[i].size = e - s;
-    map[i].type = type;
-    xfree(d->arch.e820);
-    d->arch.e820 = map;
-    d->arch.nr_e820++;
-
-    return 0;
-}
-
-static int __init pvh_setup_vmx_realmode_helpers(struct domain *d)
-{
-    p2m_type_t p2mt;
-    uint32_t rc, *ident_pt;
-    mfn_t mfn;
-    paddr_t gaddr;
-    struct vcpu *v = d->vcpu[0];
-
-    /*
-     * Steal some space from the last RAM region below 4GB and use it to
-     * store the real-mode TSS. It needs to be aligned to 128 so that the
-     * TSS structure (which accounts for the first 104b) doesn't cross
-     * a page boundary.
-     */
-    if ( !pvh_steal_ram(d, HVM_VM86_TSS_SIZE, 128, GB(4), &gaddr) )
-    {
-        if ( hvm_copy_to_guest_phys(gaddr, NULL, HVM_VM86_TSS_SIZE, v) !=
-             HVMCOPY_okay )
-            printk("Unable to zero VM86 TSS area\n");
-        d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED] =
-            VM86_TSS_UPDATED | ((uint64_t)HVM_VM86_TSS_SIZE << 32) | gaddr;
-        if ( pvh_add_mem_range(d, gaddr, gaddr + HVM_VM86_TSS_SIZE,
-                               E820_RESERVED) )
-            printk("Unable to set VM86 TSS as reserved in the memory map\n");
-    }
-    else
-        printk("Unable to allocate VM86 TSS area\n");
-
-    /* Steal some more RAM for the identity page tables. */
-    if ( pvh_steal_ram(d, PAGE_SIZE, PAGE_SIZE, GB(4), &gaddr) )
-    {
-        printk("Unable to find memory to stash the identity page tables\n");
-        return -ENOMEM;
-    }
-
-    /*
-     * Identity-map page table is required for running with CR0.PG=0
-     * when using Intel EPT. Create a 32-bit non-PAE page directory of
-     * superpages.
-     */
-    ident_pt = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
-                              &mfn, &p2mt, 0, &rc);
-    if ( ident_pt == NULL )
-    {
-        printk("Unable to map identity page tables\n");
-        return -ENOMEM;
-    }
-    write_32bit_pse_identmap(ident_pt);
-    unmap_domain_page(ident_pt);
-    put_page(mfn_to_page(mfn_x(mfn)));
-    d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] = gaddr;
-    if ( pvh_add_mem_range(d, gaddr, gaddr + PAGE_SIZE, E820_RESERVED) )
-            printk("Unable to set identity page tables as reserved in the memory map\n");
-
-    return 0;
-}
-
-/* Assign the low 1MB to Dom0. */
-static void __init pvh_steal_low_ram(struct domain *d, unsigned long start,
-                                     unsigned long nr_pages)
-{
-    unsigned long mfn;
-
-    ASSERT(start + nr_pages <= PFN_DOWN(MB(1)));
-
-    for ( mfn = start; mfn < start + nr_pages; mfn++ )
-    {
-        struct page_info *pg = mfn_to_page(mfn);
-        int rc;
-
-        rc = unshare_xen_page_with_guest(pg, dom_io);
-        if ( rc )
-        {
-            printk("Unable to unshare Xen mfn %#lx: %d\n", mfn, rc);
-            continue;
-        }
-
-        share_xen_page_with_guest(pg, d, XENSHARE_writable);
-        rc = guest_physmap_add_entry(d, _gfn(mfn), _mfn(mfn), 0, p2m_ram_rw);
-        if ( rc )
-            printk("Unable to add mfn %#lx to p2m: %d\n", mfn, rc);
-    }
-}
-
-static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
-{
-    struct e820entry *entry, *entry_guest;
-    unsigned int i;
-    unsigned long pages, cur_pages = 0;
-    uint64_t start, end;
-
-    /*
-     * Craft the e820 memory map for Dom0 based on the hardware e820 map.
-     */
-    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
-    if ( !d->arch.e820 )
-        panic("Unable to allocate memory for Dom0 e820 map");
-    entry_guest = d->arch.e820;
-
-    /* Clamp e820 memory map to match the memory assigned to Dom0 */
-    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
-    {
-        if ( entry->type != E820_RAM )
-        {
-            *entry_guest = *entry;
-            goto next;
-        }
-
-        if ( nr_pages == cur_pages )
-        {
-            /*
-             * We already have all the assigned memory,
-             * skip this entry
-             */
-            continue;
-        }
-
-        /*
-         * Make sure the start and length are aligned to PAGE_SIZE, because
-         * that's the minimum granularity of the 2nd stage translation. Since
-         * the p2m code uses PAGE_ORDER_4K internally, also use it here in
-         * order to prevent this code from getting out of sync.
-         */
-        start = ROUNDUP(entry->addr, PAGE_SIZE << PAGE_ORDER_4K);
-        end = (entry->addr + entry->size) &
-              ~((PAGE_SIZE << PAGE_ORDER_4K) - 1);
-        if ( start >= end )
-            continue;
-
-        entry_guest->type = E820_RAM;
-        entry_guest->addr = start;
-        entry_guest->size = end - start;
-        pages = PFN_DOWN(entry_guest->size);
-        if ( (cur_pages + pages) > nr_pages )
-        {
-            /* Truncate region */
-            entry_guest->size = (nr_pages - cur_pages) << PAGE_SHIFT;
-            cur_pages = nr_pages;
-        }
-        else
-        {
-            cur_pages += pages;
-        }
- next:
-        d->arch.nr_e820++;
-        entry_guest++;
-    }
-    ASSERT(cur_pages == nr_pages);
-    ASSERT(d->arch.nr_e820 <= e820.nr_map);
-}
-
-static int __init pvh_setup_p2m(struct domain *d)
-{
-    struct vcpu *v = d->vcpu[0];
-    unsigned long nr_pages;
-    unsigned int i;
-    int rc;
-    bool preempted;
-#define MB1_PAGES PFN_DOWN(MB(1))
-
-    nr_pages = dom0_compute_nr_pages(d, NULL, 0);
-
-    pvh_setup_e820(d, nr_pages);
-    do {
-        preempted = false;
-        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
-                              &preempted);
-        process_pending_softirqs();
-    } while ( preempted );
-
-    /*
-     * Memory below 1MB is identity mapped.
-     * NB: this only makes sense when booted from legacy BIOS.
-     */
-    rc = modify_identity_mmio(d, 0, MB1_PAGES, true);
-    if ( rc )
-    {
-        printk("Failed to identity map low 1MB: %d\n", rc);
-        return rc;
-    }
-
-    /* Populate memory map. */
-    for ( i = 0; i < d->arch.nr_e820; i++ )
-    {
-        unsigned long addr, size;
-
-        if ( d->arch.e820[i].type != E820_RAM )
-            continue;
-
-        addr = PFN_DOWN(d->arch.e820[i].addr);
-        size = PFN_DOWN(d->arch.e820[i].size);
-
-        if ( addr >= MB1_PAGES )
-            rc = pvh_populate_memory_range(d, addr, size);
-        else
-        {
-            ASSERT(addr + size < MB1_PAGES);
-            pvh_steal_low_ram(d, addr, size);
-        }
-
-        if ( rc )
-            return rc;
-    }
-
-    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
-    {
-        /*
-         * Since Dom0 cannot be migrated, we will only setup the
-         * unrestricted guest helpers if they are needed by the current
-         * hardware we are running on.
-         */
-        rc = pvh_setup_vmx_realmode_helpers(d);
-        if ( rc )
-            return rc;
-    }
-
-    return 0;
-#undef MB1_PAGES
-}
-
-static int __init pvh_load_kernel(struct domain *d, const module_t *image,
-                                  unsigned long image_headroom,
-                                  module_t *initrd, void *image_base,
-                                  char *cmdline, paddr_t *entry,
-                                  paddr_t *start_info_addr)
-{
-    void *image_start = image_base + image_headroom;
-    unsigned long image_len = image->mod_end;
-    struct elf_binary elf;
-    struct elf_dom_parms parms;
-    paddr_t last_addr;
-    struct hvm_start_info start_info = { 0 };
-    struct hvm_modlist_entry mod = { 0 };
-    struct vcpu *v = d->vcpu[0];
-    int rc;
-
-    if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 )
-    {
-        printk("Error trying to detect bz compressed kernel\n");
-        return rc;
-    }
-
-    if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
-    {
-        printk("Unable to init ELF\n");
-        return rc;
-    }
-#ifdef VERBOSE
-    elf_set_verbose(&elf);
-#endif
-    elf_parse_binary(&elf);
-    if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
-    {
-        printk("Unable to parse kernel for ELFNOTES\n");
-        return rc;
-    }
-
-    if ( parms.phys_entry == UNSET_ADDR32 )
-    {
-        printk("Unable to find XEN_ELFNOTE_PHYS32_ENTRY address\n");
-        return -EINVAL;
-    }
-
-    printk("OS: %s version: %s loader: %s bitness: %s\n", parms.guest_os,
-           parms.guest_ver, parms.loader,
-           elf_64bit(&elf) ? "64-bit" : "32-bit");
-
-    /* Copy the OS image and free temporary buffer. */
-    elf.dest_base = (void *)(parms.virt_kstart - parms.virt_base);
-    elf.dest_size = parms.virt_kend - parms.virt_kstart;
-
-    elf_set_vcpu(&elf, v);
-    rc = elf_load_binary(&elf);
-    if ( rc < 0 )
-    {
-        printk("Failed to load kernel: %d\n", rc);
-        printk("Xen dom0 kernel broken ELF: %s\n", elf_check_broken(&elf));
-        return rc;
-    }
-
-    last_addr = ROUNDUP(parms.virt_kend - parms.virt_base, PAGE_SIZE);
-
-    if ( initrd != NULL )
-    {
-        rc = hvm_copy_to_guest_phys(last_addr, mfn_to_virt(initrd->mod_start),
-                                    initrd->mod_end, v);
-        if ( rc )
-        {
-            printk("Unable to copy initrd to guest\n");
-            return rc;
-        }
-
-        mod.paddr = last_addr;
-        mod.size = initrd->mod_end;
-        last_addr += ROUNDUP(initrd->mod_end, PAGE_SIZE);
-    }
-
-    /* Free temporary buffers. */
-    discard_initial_images();
-
-    if ( cmdline != NULL )
-    {
-        rc = hvm_copy_to_guest_phys(last_addr, cmdline, strlen(cmdline) + 1, v);
-        if ( rc )
-        {
-            printk("Unable to copy guest command line\n");
-            return rc;
-        }
-        start_info.cmdline_paddr = last_addr;
-        /*
-         * Round up to 32/64 bits (depending on the guest kernel bitness) so
-         * the modlist/start_info is aligned.
-         */
-        last_addr += ROUNDUP(strlen(cmdline) + 1, elf_64bit(&elf) ? 8 : 4);
-    }
-    if ( initrd != NULL )
-    {
-        rc = hvm_copy_to_guest_phys(last_addr, &mod, sizeof(mod), v);
-        if ( rc )
-        {
-            printk("Unable to copy guest modules\n");
-            return rc;
-        }
-        start_info.modlist_paddr = last_addr;
-        start_info.nr_modules = 1;
-        last_addr += sizeof(mod);
-    }
-
-    start_info.magic = XEN_HVM_START_MAGIC_VALUE;
-    start_info.flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
-    rc = hvm_copy_to_guest_phys(last_addr, &start_info, sizeof(start_info), v);
-    if ( rc )
-    {
-        printk("Unable to copy start info to guest\n");
-        return rc;
-    }
-
-    *entry = parms.phys_entry;
-    *start_info_addr = last_addr;
-
-    return 0;
-}
-
-static int __init pvh_setup_cpus(struct domain *d, paddr_t entry,
-                                 paddr_t start_info)
-{
-    struct vcpu *v = d->vcpu[0];
-    unsigned int cpu, i;
-    int rc;
-    /* 
-     * This sets the vCPU state according to the state described in
-     * docs/misc/hvmlite.markdown.
-     */
-    vcpu_hvm_context_t cpu_ctx = {
-        .mode = VCPU_HVM_MODE_32B,
-        .cpu_regs.x86_32.ebx = start_info,
-        .cpu_regs.x86_32.eip = entry,
-        .cpu_regs.x86_32.cr0 = X86_CR0_PE | X86_CR0_ET,
-        .cpu_regs.x86_32.cs_limit = ~0u,
-        .cpu_regs.x86_32.ds_limit = ~0u,
-        .cpu_regs.x86_32.ss_limit = ~0u,
-        .cpu_regs.x86_32.tr_limit = 0x67,
-        .cpu_regs.x86_32.cs_ar = 0xc9b,
-        .cpu_regs.x86_32.ds_ar = 0xc93,
-        .cpu_regs.x86_32.ss_ar = 0xc93,
-        .cpu_regs.x86_32.tr_ar = 0x8b,
-    };
-
-    cpu = v->processor;
-    for ( i = 1; i < d->max_vcpus; i++ )
-    {
-        struct vcpu *p = dom0_setup_vcpu(d, i, cpu);
-
-        if ( p )
-            cpu = p->processor;
-    }
-
-    rc = arch_set_info_hvm_guest(v, &cpu_ctx);
-    if ( rc )
-    {
-        printk("Unable to setup Dom0 BSP context: %d\n", rc);
-        return rc;
-    }
-
-    rc = dom0_setup_permissions(d);
-    if ( rc )
-    {
-        panic("Unable to setup Dom0 permissions: %d\n", rc);
-        return rc;
-    }
-
-    update_domain_wallclock_time(d);
-
-    clear_bit(_VPF_down, &v->pause_flags);
-
-    return 0;
-}
-
-static int __init acpi_count_intr_ovr(struct acpi_subtable_header *header,
-                                     const unsigned long end)
-{
-
-    acpi_intr_overrides++;
-    return 0;
-}
-
-static int __init acpi_set_intr_ovr(struct acpi_subtable_header *header,
-                                    const unsigned long end)
-{
-    const struct acpi_madt_interrupt_override *intr =
-        container_of(header, struct acpi_madt_interrupt_override, header);
-
-    *intsrcovr = *intr;
-    intsrcovr++;
-
-    return 0;
-}
-
-static int __init acpi_count_nmi_src(struct acpi_subtable_header *header,
-                                     const unsigned long end)
-{
-
-    acpi_nmi_sources++;
-    return 0;
-}
-
-static int __init acpi_set_nmi_src(struct acpi_subtable_header *header,
-                                   const unsigned long end)
-{
-    const struct acpi_madt_nmi_source *src =
-        container_of(header, struct acpi_madt_nmi_source, header);
-
-    *nmisrc = *src;
-    nmisrc++;
-
-    return 0;
-}
-
-static int __init pvh_setup_acpi_madt(struct domain *d, paddr_t *addr)
-{
-    struct acpi_table_madt *madt;
-    struct acpi_table_header *table;
-    struct acpi_madt_io_apic *io_apic;
-    struct acpi_madt_local_x2apic *x2apic;
-    acpi_status status;
-    unsigned long size;
-    unsigned int i, max_vcpus;
-    int rc;
-
-    /* Count number of interrupt overrides in the MADT. */
-    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
-                          acpi_count_intr_ovr, UINT_MAX);
-
-    /* Count number of NMI sources in the MADT. */
-    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_count_nmi_src,
-                          UINT_MAX);
-
-    max_vcpus = dom0_max_vcpus();
-    /* Calculate the size of the crafted MADT. */
-    size = sizeof(*madt);
-    /*
-     * FIXME: the current vIO-APIC code just supports one IO-APIC instance
-     * per domain. This must be fixed in order to provide the same amount of
-     * IO APICs as available on bare metal.
-     */
-    size += sizeof(*io_apic);
-    size += sizeof(*intsrcovr) * acpi_intr_overrides;
-    size += sizeof(*nmisrc) * acpi_nmi_sources;
-    size += sizeof(*x2apic) * max_vcpus;
-
-    madt = xzalloc_bytes(size);
-    if ( !madt )
-    {
-        printk("Unable to allocate memory for MADT table\n");
-        rc = -ENOMEM;
-        goto out;
-    }
-
-    /* Copy the native MADT table header. */
-    status = acpi_get_table(ACPI_SIG_MADT, 0, &table);
-    if ( !ACPI_SUCCESS(status) )
-    {
-        printk("Failed to get MADT ACPI table, aborting.\n");
-        rc = -EINVAL;
-        goto out;
-    }
-    madt->header = *table;
-    madt->address = APIC_DEFAULT_PHYS_BASE;
-    /*
-     * NB: this is currently set to 4, which is the revision in the ACPI
-     * spec 6.1. Sadly ACPICA doesn't provide revision numbers for the
-     * tables described in the headers.
-     */
-    madt->header.revision = min_t(unsigned char, table->revision, 4);
-
-    /*
-     * Setup the IO APIC entry.
-     * FIXME: the current vIO-APIC code just supports one IO-APIC instance
-     * per domain. This must be fixed in order to provide the same amount of
-     * IO APICs as available on bare metal, and with the same IDs as found in
-     * the native IO APIC MADT entries.
-     */
-    if ( nr_ioapics > 1 )
-        printk("WARNING: found %d IO APICs, Dom0 will only have access to 1 emulated IO APIC\n",
-               nr_ioapics);
-    io_apic = (void *)(madt + 1);
-    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
-    io_apic->header.length = sizeof(*io_apic);
-    io_apic->id = domain_vioapic(d)->id;
-    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
-
-    x2apic = (void *)(io_apic + 1);
-    for ( i = 0; i < max_vcpus; i++ )
-    {
-        x2apic->header.type = ACPI_MADT_TYPE_LOCAL_X2APIC;
-        x2apic->header.length = sizeof(*x2apic);
-        x2apic->uid = i;
-        x2apic->local_apic_id = i * 2;
-        x2apic->lapic_flags = ACPI_MADT_ENABLED;
-        x2apic++;
-    }
-
-    /* Setup interrupt overrides. */
-    intsrcovr = (void *)x2apic;
-    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_set_intr_ovr,
-                          acpi_intr_overrides);
-
-    /* Setup NMI sources. */
-    nmisrc = (void *)intsrcovr;
-    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_set_nmi_src,
-                          acpi_nmi_sources);
-
-    ASSERT(((void *)nmisrc - (void *)madt) == size);
-    madt->header.length = size;
-    /*
-     * Calling acpi_tb_checksum here is a layering violation, but
-     * introducing a wrapper for such simple usage seems overkill.
-     */
-    madt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, madt), size);
-
-    /* Place the new MADT in guest memory space. */
-    if ( pvh_steal_ram(d, size, 0, GB(4), addr) )
-    {
-        printk("Unable to find allocate guest RAM for MADT\n");
-        rc = -ENOMEM;
-        goto out;
-    }
-
-    /* Mark this region as E820_ACPI. */
-    if ( pvh_add_mem_range(d, *addr, *addr + size, E820_ACPI) )
-        printk("Unable to add MADT region to memory map\n");
-
-    rc = hvm_copy_to_guest_phys(*addr, madt, size, d->vcpu[0]);
-    if ( rc )
-    {
-        printk("Unable to copy MADT into guest memory\n");
-        goto out;
-    }
-
-    rc = 0;
-
- out:
-    xfree(madt);
-
-    return rc;
-}
-
-static bool __init acpi_memory_banned(unsigned long address,
-                                      unsigned long size)
-{
-    unsigned long mfn, nr_pages, i;
-
-    mfn = PFN_DOWN(address);
-    nr_pages = PFN_UP((address & ~PAGE_MASK) + size);
-    for ( i = 0 ; i < nr_pages; i++ )
-        if ( !page_is_ram_type(mfn + i, RAM_TYPE_RESERVED) &&
-             !page_is_ram_type(mfn + i, RAM_TYPE_ACPI) )
-            return true;
-
-    return false;
-}
-
-static bool __init pvh_acpi_table_allowed(const char *sig)
-{
-    static const char __initconst banned_tables[][ACPI_NAME_SIZE] = {
-        ACPI_SIG_HPET, ACPI_SIG_SLIT, ACPI_SIG_SRAT, ACPI_SIG_MPST,
-        ACPI_SIG_PMTT, ACPI_SIG_MADT, ACPI_SIG_DMAR};
-    unsigned int i;
-
-    for ( i = 0 ; i < ARRAY_SIZE(banned_tables); i++ )
-        if ( strncmp(sig, banned_tables[i], ACPI_NAME_SIZE) == 0 )
-            return false;
-
-    /* Make sure table doesn't reside in a RAM region. */
-    if ( acpi_memory_banned(acpi_gbl_root_table_list.tables[i].address,
-                            acpi_gbl_root_table_list.tables[i].length) )
-    {
-        printk("Skipping table %.4s because resides in a non-ACPI, non-reserved region\n",
-               sig);
-        return false;
-    }
-
-    return true;
-}
-
-static int __init pvh_setup_acpi_xsdt(struct domain *d, paddr_t madt_addr,
-                                      paddr_t *addr)
-{
-    struct acpi_table_xsdt *xsdt;
-    struct acpi_table_header *table;
-    struct acpi_table_rsdp *rsdp;
-    unsigned long size = sizeof(*xsdt);
-    unsigned int i, j, num_tables = 0;
-    paddr_t xsdt_paddr;
-    int rc;
-
-    /*
-     * Restore original DMAR table signature, we are going to filter it from
-     * the new XSDT that is presented to the guest, so it is no longer
-     * necessary to have it's signature zapped.
-     */
-    acpi_dmar_reinstate();
-
-    /* Count the number of tables that will be added to the XSDT. */
-    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
-    {
-        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
-
-        if ( pvh_acpi_table_allowed(sig) )
-            num_tables++;
-    }
-
-    /*
-     * No need to add or subtract anything because struct acpi_table_xsdt
-     * includes one array slot already, and we have filtered out the original
-     * MADT and we are going to add a custom built MADT.
-     */
-    size += num_tables * sizeof(xsdt->table_offset_entry[0]);
-
-    xsdt = xzalloc_bytes(size);
-    if ( !xsdt )
-    {
-        printk("Unable to allocate memory for XSDT table\n");
-        rc = -ENOMEM;
-        goto out;
-    }
-
-    /* Copy the native XSDT table header. */
-    rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(*rsdp));
-    if ( !rsdp )
-    {
-        printk("Unable to map RSDP\n");
-        rc = -EINVAL;
-        goto out;
-    }
-    xsdt_paddr = rsdp->xsdt_physical_address;
-    acpi_os_unmap_memory(rsdp, sizeof(*rsdp));
-    table = acpi_os_map_memory(xsdt_paddr, sizeof(*table));
-    if ( !table )
-    {
-        printk("Unable to map XSDT\n");
-        rc = -EINVAL;
-        goto out;
-    }
-    xsdt->header = *table;
-    acpi_os_unmap_memory(table, sizeof(*table));
-
-    /* Add the custom MADT. */
-    xsdt->table_offset_entry[0] = madt_addr;
-
-    /* Copy the addresses of the rest of the allowed tables. */
-    for( i = 0, j = 1; i < acpi_gbl_root_table_list.count; i++ )
-    {
-        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
-
-        if ( pvh_acpi_table_allowed(sig) )
-            xsdt->table_offset_entry[j++] =
-                                acpi_gbl_root_table_list.tables[i].address;
-    }
-
-    xsdt->header.revision = 1;
-    xsdt->header.length = size;
-    /*
-     * Calling acpi_tb_checksum here is a layering violation, but
-     * introducing a wrapper for such simple usage seems overkill.
-     */
-    xsdt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, xsdt), size);
-
-    /* Place the new XSDT in guest memory space. */
-    if ( pvh_steal_ram(d, size, 0, GB(4), addr) )
-    {
-        printk("Unable to find guest RAM for XSDT\n");
-        rc = -ENOMEM;
-        goto out;
-    }
-
-    /* Mark this region as E820_ACPI. */
-    if ( pvh_add_mem_range(d, *addr, *addr + size, E820_ACPI) )
-        printk("Unable to add XSDT region to memory map\n");
-
-    rc = hvm_copy_to_guest_phys(*addr, xsdt, size, d->vcpu[0]);
-    if ( rc )
-    {
-        printk("Unable to copy XSDT into guest memory\n");
-        goto out;
-    }
-
-    rc = 0;
-
- out:
-    xfree(xsdt);
-
-    return rc;
-}
-
-static int __init pvh_setup_acpi(struct domain *d, paddr_t start_info)
-{
-    unsigned long pfn, nr_pages;
-    paddr_t madt_paddr, xsdt_paddr, rsdp_paddr;
-    unsigned int i;
-    int rc;
-    struct acpi_table_rsdp *native_rsdp, rsdp = {
-        .signature = ACPI_SIG_RSDP,
-        .revision = 2,
-        .length = sizeof(rsdp),
-    };
-
-
-    /* Scan top-level tables and add their regions to the guest memory map. */
-    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
-    {
-        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
-        unsigned long addr = acpi_gbl_root_table_list.tables[i].address;
-        unsigned long size = acpi_gbl_root_table_list.tables[i].length;
-
-        /*
-         * Make sure the original MADT is also mapped, so that Dom0 can
-         * properly access the data returned by _MAT methods in case it's
-         * re-using MADT memory.
-         */
-        if ( strncmp(sig, ACPI_SIG_MADT, ACPI_NAME_SIZE)
-             ? pvh_acpi_table_allowed(sig)
-             : !acpi_memory_banned(addr, size) )
-             pvh_add_mem_range(d, addr, addr + size, E820_ACPI);
-    }
-
-    /* Identity map ACPI e820 regions. */
-    for ( i = 0; i < d->arch.nr_e820; i++ )
-    {
-        if ( d->arch.e820[i].type != E820_ACPI &&
-             d->arch.e820[i].type != E820_NVS )
-            continue;
-
-        pfn = PFN_DOWN(d->arch.e820[i].addr);
-        nr_pages = PFN_UP((d->arch.e820[i].addr & ~PAGE_MASK) +
-                          d->arch.e820[i].size);
-
-        rc = modify_identity_mmio(d, pfn, nr_pages, true);
-        if ( rc )
-        {
-            printk("Failed to map ACPI region [%#lx, %#lx) into Dom0 memory map\n",
-                   pfn, pfn + nr_pages);
-            return rc;
-        }
-    }
-
-    rc = pvh_setup_acpi_madt(d, &madt_paddr);
-    if ( rc )
-        return rc;
-
-    rc = pvh_setup_acpi_xsdt(d, madt_paddr, &xsdt_paddr);
-    if ( rc )
-        return rc;
-
-    /* Craft a custom RSDP. */
-    native_rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(rsdp));
-    if ( !native_rsdp )
-    {
-        printk("Failed to map native RSDP\n");
-        return -ENOMEM;
-    }
-    memcpy(rsdp.oem_id, native_rsdp->oem_id, sizeof(rsdp.oem_id));
-    acpi_os_unmap_memory(native_rsdp, sizeof(rsdp));
-    rsdp.xsdt_physical_address = xsdt_paddr;
-    /*
-     * Calling acpi_tb_checksum here is a layering violation, but
-     * introducing a wrapper for such simple usage seems overkill.
-     */
-    rsdp.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
-                                      ACPI_RSDP_REV0_SIZE);
-    rsdp.extended_checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
-                                               sizeof(rsdp));
-
-    /*
-     * Place the new RSDP in guest memory space.
-     *
-     * NB: this RSDP is not going to replace the original RSDP, which should
-     * still be accessible to the guest. However that RSDP is going to point to
-     * the native RSDT, and should not be used for the Dom0 kernel's boot
-     * purposes (we keep it visible for post boot access).
-     */
-    if ( pvh_steal_ram(d, sizeof(rsdp), 0, GB(4), &rsdp_paddr) )
-    {
-        printk("Unable to allocate guest RAM for RSDP\n");
-        return -ENOMEM;
-    }
-
-    /* Mark this region as E820_ACPI. */
-    if ( pvh_add_mem_range(d, rsdp_paddr, rsdp_paddr + sizeof(rsdp),
-                           E820_ACPI) )
-        printk("Unable to add RSDP region to memory map\n");
-
-    /* Copy RSDP into guest memory. */
-    rc = hvm_copy_to_guest_phys(rsdp_paddr, &rsdp, sizeof(rsdp), d->vcpu[0]);
-    if ( rc )
-    {
-        printk("Unable to copy RSDP into guest memory\n");
-        return rc;
-    }
-
-    /* Copy RSDP address to start_info. */
-    rc = hvm_copy_to_guest_phys(start_info +
-                                offsetof(struct hvm_start_info, rsdp_paddr),
-                                &rsdp_paddr,
-                                sizeof(((struct hvm_start_info *)
-                                        0)->rsdp_paddr),
-                                d->vcpu[0]);
-    if ( rc )
-    {
-        printk("Unable to copy RSDP into guest memory\n");
-        return rc;
-    }
-
-    return 0;
-}
-
-static int __init construct_dom0_pvh(struct domain *d, const module_t *image,
-                                     unsigned long image_headroom,
-                                     module_t *initrd,
-                                     void *(*bootstrap_map)(const module_t *),
-                                     char *cmdline)
-{
-    paddr_t entry, start_info;
-    int rc;
-
-    printk("** Building a PVH Dom0 **\n");
-
-    iommu_hwdom_init(d);
-
-    rc = pvh_setup_p2m(d);
-    if ( rc )
-    {
-        printk("Failed to setup Dom0 physical memory map\n");
-        return rc;
-    }
-
-    rc = pvh_load_kernel(d, image, image_headroom, initrd, bootstrap_map(image),
-                         cmdline, &entry, &start_info);
-    if ( rc )
-    {
-        printk("Failed to load Dom0 kernel\n");
-        return rc;
-    }
-
-    rc = pvh_setup_cpus(d, entry, start_info);
-    if ( rc )
-    {
-        printk("Failed to setup Dom0 CPUs: %d\n", rc);
-        return rc;
-    }
-
-    rc = pvh_setup_acpi(d, start_info);
-    if ( rc )
-    {
-        printk("Failed to setup Dom0 ACPI tables: %d\n", rc);
-        return rc;
-    }
-
-    panic("Building a PVHv2 Dom0 is not yet supported.");
-    return 0;
-}
-
 int __init construct_dom0(struct domain *d, const module_t *image,
                           unsigned long image_headroom, module_t *initrd,
                           void *(*bootstrap_map)(const module_t *),
@@ -1549,7 +484,7 @@  int __init construct_dom0(struct domain *d, const module_t *image,
 
     process_pending_softirqs();
 
-    return (is_hvm_domain(d) ? construct_dom0_pvh : dom0_construct_pv)
+    return (is_hvm_domain(d) ? dom0_construct_pvh : dom0_construct_pv)
            (d, image, image_headroom, initrd,bootstrap_map, cmdline);
 }
 
diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile
index ec0daae0ea..0a3d0f4f7e 100644
--- a/xen/arch/x86/hvm/Makefile
+++ b/xen/arch/x86/hvm/Makefile
@@ -3,6 +3,7 @@  subdir-y += vmx
 
 obj-y += asid.o
 obj-y += dm.o
+obj-bin-y += dom0_build.init.o
 obj-y += emulate.o
 obj-y += hpet.o
 obj-y += hvm.o
diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
new file mode 100644
index 0000000000..04a65899f5
--- /dev/null
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -0,0 +1,1114 @@ 
+/*
+ * hvm/dom0_build.c
+ *
+ * Dom0 builder for PVH guest.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/acpi.h>
+#include <xen/init.h>
+#include <xen/libelf.h>
+#include <xen/multiboot.h>
+#include <xen/softirq.h>
+
+#include <acpi/actables.h>
+
+#include <asm/bzimage.h>
+#include <asm/dom0_build.h>
+#include <asm/hvm/support.h>
+#include <asm/io_apic.h>
+#include <asm/p2m.h>
+#include <asm/paging.h>
+#include <asm/setup.h>
+
+#include <public/arch-x86/hvm/start_info.h>
+#include <public/hvm/hvm_info_table.h>
+#include <public/hvm/hvm_vcpu.h>
+
+/*
+ * Have the TSS cover the ISA port range, which makes it
+ * - 104 bytes base structure
+ * - 32 bytes interrupt redirection bitmap
+ * - 128 bytes I/O bitmap
+ * - one trailing byte
+ * or a total of 265 bytes.
+ *
+ * NB: as PVHv2 Dom0 doesn't have legacy devices (ISA), it shouldn't have any
+ * business in accessing the ISA port range, much less in real mode, and due to
+ * the lack of firmware it shouldn't also execute any INT instruction. This is
+ * done just for consistency with what hvmloader does.
+ */
+#define HVM_VM86_TSS_SIZE 265
+
+static unsigned int __initdata acpi_intr_overrides;
+static struct acpi_madt_interrupt_override __initdata *intsrcovr;
+
+static unsigned int __initdata acpi_nmi_sources;
+static struct acpi_madt_nmi_source __initdata *nmisrc;
+
+static int __init modify_identity_mmio(struct domain *d, unsigned long pfn,
+                                       unsigned long nr_pages, const bool map)
+{
+    int rc;
+
+    for ( ; ; )
+    {
+        rc = (map ? map_mmio_regions : unmap_mmio_regions)
+             (d, _gfn(pfn), nr_pages, _mfn(pfn));
+        if ( rc == 0 )
+            break;
+        if ( rc < 0 )
+        {
+            printk(XENLOG_WARNING
+                   "Failed to identity %smap [%#lx,%#lx) for d%d: %d\n",
+                   map ? "" : "un", pfn, pfn + nr_pages, d->domain_id, rc);
+            break;
+        }
+        nr_pages -= rc;
+        pfn += rc;
+        process_pending_softirqs();
+    }
+
+    return rc;
+}
+
+/* Populate a HVM memory range using the biggest possible order. */
+static int __init pvh_populate_memory_range(struct domain *d,
+                                            unsigned long start,
+                                            unsigned long nr_pages)
+{
+    unsigned int order, i = 0;
+    struct page_info *page;
+    int rc;
+#define MAP_MAX_ITER 64
+
+    order = MAX_ORDER;
+    while ( nr_pages != 0 )
+    {
+        unsigned int range_order = get_order_from_pages(nr_pages + 1);
+
+        order = min(range_order ? range_order - 1 : 0, order);
+        page = alloc_domheap_pages(d, order, dom0_memflags);
+        if ( page == NULL )
+        {
+            if ( order == 0 && dom0_memflags )
+            {
+                /* Try again without any dom0_memflags. */
+                dom0_memflags = 0;
+                order = MAX_ORDER;
+                continue;
+            }
+            if ( order == 0 )
+            {
+                printk("Unable to allocate memory with order 0!\n");
+                return -ENOMEM;
+            }
+            order--;
+            continue;
+        }
+
+        rc = guest_physmap_add_page(d, _gfn(start), _mfn(page_to_mfn(page)),
+                                    order);
+        if ( rc != 0 )
+        {
+            printk("Failed to populate memory: [%#lx,%lx): %d\n",
+                   start, start + (1UL << order), rc);
+            return -ENOMEM;
+        }
+        start += 1UL << order;
+        nr_pages -= 1UL << order;
+        if ( (++i % MAP_MAX_ITER) == 0 )
+            process_pending_softirqs();
+    }
+
+    return 0;
+#undef MAP_MAX_ITER
+}
+
+/* Steal RAM from the end of a memory region. */
+static int __init pvh_steal_ram(struct domain *d, unsigned long size,
+                                unsigned long align, paddr_t limit,
+                                paddr_t *addr)
+{
+    unsigned int i = d->arch.nr_e820;
+
+    /*
+     * Alignment 0 should be set to 1, so it doesn't wrap around in the
+     * calculations below.
+     */
+    align = align ? : 1;
+    while ( i-- )
+    {
+        struct e820entry *entry = &d->arch.e820[i];
+
+        if ( entry->type != E820_RAM || entry->addr + entry->size > limit ||
+             entry->addr < MB(1) )
+            continue;
+
+        *addr = (entry->addr + entry->size - size) & ~(align - 1);
+        if ( *addr < entry->addr )
+            continue;
+
+        entry->size = *addr - entry->addr;
+        return 0;
+    }
+
+    return -ENOMEM;
+}
+
+/* NB: memory map must be sorted at all times for this to work correctly. */
+static int __init pvh_add_mem_range(struct domain *d, uint64_t s, uint64_t e,
+                                    unsigned int type)
+{
+    struct e820entry *map;
+    unsigned int i;
+
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        uint64_t rs = d->arch.e820[i].addr;
+        uint64_t re = rs + d->arch.e820[i].size;
+
+        if ( rs == e && d->arch.e820[i].type == type )
+        {
+            d->arch.e820[i].addr = s;
+            return 0;
+        }
+
+        if ( re == s && d->arch.e820[i].type == type &&
+             (i + 1 == d->arch.nr_e820 || d->arch.e820[i + 1].addr >= e) )
+        {
+            d->arch.e820[i].size += e - s;
+            return 0;
+        }
+
+        if ( rs >= e )
+            break;
+
+        if ( re > s )
+            return -EEXIST;
+    }
+
+    map = xzalloc_array(struct e820entry, d->arch.nr_e820 + 1);
+    if ( !map )
+    {
+        printk(XENLOG_WARNING "E820: out of memory to add region\n");
+        return -ENOMEM;
+    }
+
+    memcpy(map, d->arch.e820, i * sizeof(*d->arch.e820));
+    memcpy(map + i + 1, d->arch.e820 + i,
+           (d->arch.nr_e820 - i) * sizeof(*d->arch.e820));
+    map[i].addr = s;
+    map[i].size = e - s;
+    map[i].type = type;
+    xfree(d->arch.e820);
+    d->arch.e820 = map;
+    d->arch.nr_e820++;
+
+    return 0;
+}
+
+static int __init pvh_setup_vmx_realmode_helpers(struct domain *d)
+{
+    p2m_type_t p2mt;
+    uint32_t rc, *ident_pt;
+    mfn_t mfn;
+    paddr_t gaddr;
+    struct vcpu *v = d->vcpu[0];
+
+    /*
+     * Steal some space from the last RAM region below 4GB and use it to
+     * store the real-mode TSS. It needs to be aligned to 128 so that the
+     * TSS structure (which accounts for the first 104b) doesn't cross
+     * a page boundary.
+     */
+    if ( !pvh_steal_ram(d, HVM_VM86_TSS_SIZE, 128, GB(4), &gaddr) )
+    {
+        if ( hvm_copy_to_guest_phys(gaddr, NULL, HVM_VM86_TSS_SIZE, v) !=
+             HVMCOPY_okay )
+            printk("Unable to zero VM86 TSS area\n");
+        d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED] =
+            VM86_TSS_UPDATED | ((uint64_t)HVM_VM86_TSS_SIZE << 32) | gaddr;
+        if ( pvh_add_mem_range(d, gaddr, gaddr + HVM_VM86_TSS_SIZE,
+                               E820_RESERVED) )
+            printk("Unable to set VM86 TSS as reserved in the memory map\n");
+    }
+    else
+        printk("Unable to allocate VM86 TSS area\n");
+
+    /* Steal some more RAM for the identity page tables. */
+    if ( pvh_steal_ram(d, PAGE_SIZE, PAGE_SIZE, GB(4), &gaddr) )
+    {
+        printk("Unable to find memory to stash the identity page tables\n");
+        return -ENOMEM;
+    }
+
+    /*
+     * Identity-map page table is required for running with CR0.PG=0
+     * when using Intel EPT. Create a 32-bit non-PAE page directory of
+     * superpages.
+     */
+    ident_pt = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
+                              &mfn, &p2mt, 0, &rc);
+    if ( ident_pt == NULL )
+    {
+        printk("Unable to map identity page tables\n");
+        return -ENOMEM;
+    }
+    write_32bit_pse_identmap(ident_pt);
+    unmap_domain_page(ident_pt);
+    put_page(mfn_to_page(mfn_x(mfn)));
+    d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] = gaddr;
+    if ( pvh_add_mem_range(d, gaddr, gaddr + PAGE_SIZE, E820_RESERVED) )
+            printk("Unable to set identity page tables as reserved in the memory map\n");
+
+    return 0;
+}
+
+/* Assign the low 1MB to Dom0. */
+static void __init pvh_steal_low_ram(struct domain *d, unsigned long start,
+                                     unsigned long nr_pages)
+{
+    unsigned long mfn;
+
+    ASSERT(start + nr_pages <= PFN_DOWN(MB(1)));
+
+    for ( mfn = start; mfn < start + nr_pages; mfn++ )
+    {
+        struct page_info *pg = mfn_to_page(mfn);
+        int rc;
+
+        rc = unshare_xen_page_with_guest(pg, dom_io);
+        if ( rc )
+        {
+            printk("Unable to unshare Xen mfn %#lx: %d\n", mfn, rc);
+            continue;
+        }
+
+        share_xen_page_with_guest(pg, d, XENSHARE_writable);
+        rc = guest_physmap_add_entry(d, _gfn(mfn), _mfn(mfn), 0, p2m_ram_rw);
+        if ( rc )
+            printk("Unable to add mfn %#lx to p2m: %d\n", mfn, rc);
+    }
+}
+
+static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
+{
+    struct e820entry *entry, *entry_guest;
+    unsigned int i;
+    unsigned long pages, cur_pages = 0;
+    uint64_t start, end;
+
+    /*
+     * Craft the e820 memory map for Dom0 based on the hardware e820 map.
+     */
+    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
+    if ( !d->arch.e820 )
+        panic("Unable to allocate memory for Dom0 e820 map");
+    entry_guest = d->arch.e820;
+
+    /* Clamp e820 memory map to match the memory assigned to Dom0 */
+    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
+    {
+        if ( entry->type != E820_RAM )
+        {
+            *entry_guest = *entry;
+            goto next;
+        }
+
+        if ( nr_pages == cur_pages )
+        {
+            /*
+             * We already have all the assigned memory,
+             * skip this entry
+             */
+            continue;
+        }
+
+        /*
+         * Make sure the start and length are aligned to PAGE_SIZE, because
+         * that's the minimum granularity of the 2nd stage translation. Since
+         * the p2m code uses PAGE_ORDER_4K internally, also use it here in
+         * order to prevent this code from getting out of sync.
+         */
+        start = ROUNDUP(entry->addr, PAGE_SIZE << PAGE_ORDER_4K);
+        end = (entry->addr + entry->size) &
+              ~((PAGE_SIZE << PAGE_ORDER_4K) - 1);
+        if ( start >= end )
+            continue;
+
+        entry_guest->type = E820_RAM;
+        entry_guest->addr = start;
+        entry_guest->size = end - start;
+        pages = PFN_DOWN(entry_guest->size);
+        if ( (cur_pages + pages) > nr_pages )
+        {
+            /* Truncate region */
+            entry_guest->size = (nr_pages - cur_pages) << PAGE_SHIFT;
+            cur_pages = nr_pages;
+        }
+        else
+        {
+            cur_pages += pages;
+        }
+ next:
+        d->arch.nr_e820++;
+        entry_guest++;
+    }
+    ASSERT(cur_pages == nr_pages);
+    ASSERT(d->arch.nr_e820 <= e820.nr_map);
+}
+
+static int __init pvh_setup_p2m(struct domain *d)
+{
+    struct vcpu *v = d->vcpu[0];
+    unsigned long nr_pages;
+    unsigned int i;
+    int rc;
+    bool preempted;
+#define MB1_PAGES PFN_DOWN(MB(1))
+
+    nr_pages = dom0_compute_nr_pages(d, NULL, 0);
+
+    pvh_setup_e820(d, nr_pages);
+    do {
+        preempted = false;
+        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
+                              &preempted);
+        process_pending_softirqs();
+    } while ( preempted );
+
+    /*
+     * Memory below 1MB is identity mapped.
+     * NB: this only makes sense when booted from legacy BIOS.
+     */
+    rc = modify_identity_mmio(d, 0, MB1_PAGES, true);
+    if ( rc )
+    {
+        printk("Failed to identity map low 1MB: %d\n", rc);
+        return rc;
+    }
+
+    /* Populate memory map. */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        unsigned long addr, size;
+
+        if ( d->arch.e820[i].type != E820_RAM )
+            continue;
+
+        addr = PFN_DOWN(d->arch.e820[i].addr);
+        size = PFN_DOWN(d->arch.e820[i].size);
+
+        if ( addr >= MB1_PAGES )
+            rc = pvh_populate_memory_range(d, addr, size);
+        else
+        {
+            ASSERT(addr + size < MB1_PAGES);
+            pvh_steal_low_ram(d, addr, size);
+        }
+
+        if ( rc )
+            return rc;
+    }
+
+    if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) )
+    {
+        /*
+         * Since Dom0 cannot be migrated, we will only setup the
+         * unrestricted guest helpers if they are needed by the current
+         * hardware we are running on.
+         */
+        rc = pvh_setup_vmx_realmode_helpers(d);
+        if ( rc )
+            return rc;
+    }
+
+    return 0;
+#undef MB1_PAGES
+}
+
+static int __init pvh_load_kernel(struct domain *d, const module_t *image,
+                                  unsigned long image_headroom,
+                                  module_t *initrd, void *image_base,
+                                  char *cmdline, paddr_t *entry,
+                                  paddr_t *start_info_addr)
+{
+    void *image_start = image_base + image_headroom;
+    unsigned long image_len = image->mod_end;
+    struct elf_binary elf;
+    struct elf_dom_parms parms;
+    paddr_t last_addr;
+    struct hvm_start_info start_info = { 0 };
+    struct hvm_modlist_entry mod = { 0 };
+    struct vcpu *v = d->vcpu[0];
+    int rc;
+
+    if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 )
+    {
+        printk("Error trying to detect bz compressed kernel\n");
+        return rc;
+    }
+
+    if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
+    {
+        printk("Unable to init ELF\n");
+        return rc;
+    }
+#ifdef VERBOSE
+    elf_set_verbose(&elf);
+#endif
+    elf_parse_binary(&elf);
+    if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
+    {
+        printk("Unable to parse kernel for ELFNOTES\n");
+        return rc;
+    }
+
+    if ( parms.phys_entry == UNSET_ADDR32 )
+    {
+        printk("Unable to find XEN_ELFNOTE_PHYS32_ENTRY address\n");
+        return -EINVAL;
+    }
+
+    printk("OS: %s version: %s loader: %s bitness: %s\n", parms.guest_os,
+           parms.guest_ver, parms.loader,
+           elf_64bit(&elf) ? "64-bit" : "32-bit");
+
+    /* Copy the OS image and free temporary buffer. */
+    elf.dest_base = (void *)(parms.virt_kstart - parms.virt_base);
+    elf.dest_size = parms.virt_kend - parms.virt_kstart;
+
+    elf_set_vcpu(&elf, v);
+    rc = elf_load_binary(&elf);
+    if ( rc < 0 )
+    {
+        printk("Failed to load kernel: %d\n", rc);
+        printk("Xen dom0 kernel broken ELF: %s\n", elf_check_broken(&elf));
+        return rc;
+    }
+
+    last_addr = ROUNDUP(parms.virt_kend - parms.virt_base, PAGE_SIZE);
+
+    if ( initrd != NULL )
+    {
+        rc = hvm_copy_to_guest_phys(last_addr, mfn_to_virt(initrd->mod_start),
+                                    initrd->mod_end, v);
+        if ( rc )
+        {
+            printk("Unable to copy initrd to guest\n");
+            return rc;
+        }
+
+        mod.paddr = last_addr;
+        mod.size = initrd->mod_end;
+        last_addr += ROUNDUP(initrd->mod_end, PAGE_SIZE);
+    }
+
+    /* Free temporary buffers. */
+    discard_initial_images();
+
+    if ( cmdline != NULL )
+    {
+        rc = hvm_copy_to_guest_phys(last_addr, cmdline, strlen(cmdline) + 1, v);
+        if ( rc )
+        {
+            printk("Unable to copy guest command line\n");
+            return rc;
+        }
+        start_info.cmdline_paddr = last_addr;
+        /*
+         * Round up to 32/64 bits (depending on the guest kernel bitness) so
+         * the modlist/start_info is aligned.
+         */
+        last_addr += ROUNDUP(strlen(cmdline) + 1, elf_64bit(&elf) ? 8 : 4);
+    }
+    if ( initrd != NULL )
+    {
+        rc = hvm_copy_to_guest_phys(last_addr, &mod, sizeof(mod), v);
+        if ( rc )
+        {
+            printk("Unable to copy guest modules\n");
+            return rc;
+        }
+        start_info.modlist_paddr = last_addr;
+        start_info.nr_modules = 1;
+        last_addr += sizeof(mod);
+    }
+
+    start_info.magic = XEN_HVM_START_MAGIC_VALUE;
+    start_info.flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
+    rc = hvm_copy_to_guest_phys(last_addr, &start_info, sizeof(start_info), v);
+    if ( rc )
+    {
+        printk("Unable to copy start info to guest\n");
+        return rc;
+    }
+
+    *entry = parms.phys_entry;
+    *start_info_addr = last_addr;
+
+    return 0;
+}
+
+static int __init pvh_setup_cpus(struct domain *d, paddr_t entry,
+                                 paddr_t start_info)
+{
+    struct vcpu *v = d->vcpu[0];
+    unsigned int cpu, i;
+    int rc;
+    /*
+     * This sets the vCPU state according to the state described in
+     * docs/misc/hvmlite.markdown.
+     */
+    vcpu_hvm_context_t cpu_ctx = {
+        .mode = VCPU_HVM_MODE_32B,
+        .cpu_regs.x86_32.ebx = start_info,
+        .cpu_regs.x86_32.eip = entry,
+        .cpu_regs.x86_32.cr0 = X86_CR0_PE | X86_CR0_ET,
+        .cpu_regs.x86_32.cs_limit = ~0u,
+        .cpu_regs.x86_32.ds_limit = ~0u,
+        .cpu_regs.x86_32.ss_limit = ~0u,
+        .cpu_regs.x86_32.tr_limit = 0x67,
+        .cpu_regs.x86_32.cs_ar = 0xc9b,
+        .cpu_regs.x86_32.ds_ar = 0xc93,
+        .cpu_regs.x86_32.ss_ar = 0xc93,
+        .cpu_regs.x86_32.tr_ar = 0x8b,
+    };
+
+    cpu = v->processor;
+    for ( i = 1; i < d->max_vcpus; i++ )
+    {
+        struct vcpu *p = dom0_setup_vcpu(d, i, cpu);
+
+        if ( p )
+            cpu = p->processor;
+    }
+
+    rc = arch_set_info_hvm_guest(v, &cpu_ctx);
+    if ( rc )
+    {
+        printk("Unable to setup Dom0 BSP context: %d\n", rc);
+        return rc;
+    }
+
+    rc = dom0_setup_permissions(d);
+    if ( rc )
+    {
+        panic("Unable to setup Dom0 permissions: %d\n", rc);
+        return rc;
+    }
+
+    update_domain_wallclock_time(d);
+
+    clear_bit(_VPF_down, &v->pause_flags);
+
+    return 0;
+}
+
+static int __init acpi_count_intr_ovr(struct acpi_subtable_header *header,
+                                     const unsigned long end)
+{
+
+    acpi_intr_overrides++;
+    return 0;
+}
+
+static int __init acpi_set_intr_ovr(struct acpi_subtable_header *header,
+                                    const unsigned long end)
+{
+    const struct acpi_madt_interrupt_override *intr =
+        container_of(header, struct acpi_madt_interrupt_override, header);
+
+    *intsrcovr = *intr;
+    intsrcovr++;
+
+    return 0;
+}
+
+static int __init acpi_count_nmi_src(struct acpi_subtable_header *header,
+                                     const unsigned long end)
+{
+
+    acpi_nmi_sources++;
+    return 0;
+}
+
+static int __init acpi_set_nmi_src(struct acpi_subtable_header *header,
+                                   const unsigned long end)
+{
+    const struct acpi_madt_nmi_source *src =
+        container_of(header, struct acpi_madt_nmi_source, header);
+
+    *nmisrc = *src;
+    nmisrc++;
+
+    return 0;
+}
+
+static int __init pvh_setup_acpi_madt(struct domain *d, paddr_t *addr)
+{
+    struct acpi_table_madt *madt;
+    struct acpi_table_header *table;
+    struct acpi_madt_io_apic *io_apic;
+    struct acpi_madt_local_x2apic *x2apic;
+    acpi_status status;
+    unsigned long size;
+    unsigned int i, max_vcpus;
+    int rc;
+
+    /* Count number of interrupt overrides in the MADT. */
+    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+                          acpi_count_intr_ovr, UINT_MAX);
+
+    /* Count number of NMI sources in the MADT. */
+    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_count_nmi_src,
+                          UINT_MAX);
+
+    max_vcpus = dom0_max_vcpus();
+    /* Calculate the size of the crafted MADT. */
+    size = sizeof(*madt);
+    /*
+     * FIXME: the current vIO-APIC code just supports one IO-APIC instance
+     * per domain. This must be fixed in order to provide the same number of
+     * IO APICs as are available on bare metal.
+     */
+    size += sizeof(*io_apic);
+    size += sizeof(*intsrcovr) * acpi_intr_overrides;
+    size += sizeof(*nmisrc) * acpi_nmi_sources;
+    size += sizeof(*x2apic) * max_vcpus;
+
+    madt = xzalloc_bytes(size);
+    if ( !madt )
+    {
+        printk("Unable to allocate memory for MADT table\n");
+        rc = -ENOMEM;
+        goto out;
+    }
+
+    /* Copy the native MADT table header. */
+    status = acpi_get_table(ACPI_SIG_MADT, 0, &table);
+    if ( !ACPI_SUCCESS(status) )
+    {
+        printk("Failed to get MADT ACPI table, aborting.\n");
+        rc = -EINVAL;
+        goto out;
+    }
+    madt->header = *table;
+    madt->address = APIC_DEFAULT_PHYS_BASE;
+    /*
+     * NB: this is currently set to 4, which is the revision in the ACPI
+     * spec 6.1. Sadly ACPICA doesn't provide revision numbers for the
+     * tables described in the headers.
+     */
+    madt->header.revision = min_t(unsigned char, table->revision, 4);
+
+    /*
+     * Setup the IO APIC entry.
+     * FIXME: the current vIO-APIC code just supports one IO-APIC instance
+     * per domain. This must be fixed in order to provide the same number of
+     * IO APICs as are available on bare metal, and with the same IDs as
+     * found in the native IO APIC MADT entries.
+     */
+    if ( nr_ioapics > 1 )
+        printk("WARNING: found %d IO APICs, Dom0 will only have access to 1 emulated IO APIC\n",
+               nr_ioapics);
+    io_apic = (void *)(madt + 1);
+    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
+    io_apic->header.length = sizeof(*io_apic);
+    io_apic->id = domain_vioapic(d)->id;
+    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
+
+    x2apic = (void *)(io_apic + 1);
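+    /*
+     * NB: the advertised local APIC IDs are vcpu_id * 2, which should match
+     * the IDs exposed by the emulated local APIC.
+     */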
+    for ( i = 0; i < max_vcpus; i++ )
+    {
+        x2apic->header.type = ACPI_MADT_TYPE_LOCAL_X2APIC;
+        x2apic->header.length = sizeof(*x2apic);
+        x2apic->uid = i;
+        x2apic->local_apic_id = i * 2;
+        x2apic->lapic_flags = ACPI_MADT_ENABLED;
+        x2apic++;
+    }
+
+    /* Setup interrupt overrides. */
+    intsrcovr = (void *)x2apic;
+    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_set_intr_ovr,
+                          acpi_intr_overrides);
+
+    /* Setup NMI sources. */
+    nmisrc = (void *)intsrcovr;
+    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_set_nmi_src,
+                          acpi_nmi_sources);
+
+    ASSERT(((void *)nmisrc - (void *)madt) == size);
+    madt->header.length = size;
+    /*
+     * Calling acpi_tb_checksum here is a layering violation, but
+     * introducing a wrapper for such simple usage seems overkill.
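+     *
+     * Subtracting the byte sum of the crafted table from the current
+     * checksum field restores the ACPI invariant that all bytes of the
+     * table sum to zero (mod 256).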
+     */
+    madt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, madt), size);
+
+    /* Place the new MADT in guest memory space. */
+    if ( pvh_steal_ram(d, size, 0, GB(4), addr) )
+    {
+        printk("Unable to find allocate guest RAM for MADT\n");
+        rc = -ENOMEM;
+        goto out;
+    }
+
+    /* Mark this region as E820_ACPI. */
+    if ( pvh_add_mem_range(d, *addr, *addr + size, E820_ACPI) )
+        printk("Unable to add MADT region to memory map\n");
+
+    rc = hvm_copy_to_guest_phys(*addr, madt, size, d->vcpu[0]);
+    if ( rc )
+    {
+        printk("Unable to copy MADT into guest memory\n");
+        goto out;
+    }
+
+    rc = 0;
+
+ out:
+    xfree(madt);
+
+    return rc;
+}
+
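+/*
+ * Returns true if any page of the region is not marked as reserved or ACPI
+ * in the host memory map, in which case it must not be exposed to Dom0 as
+ * ACPI data.
+ */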
+static bool __init acpi_memory_banned(unsigned long address,
+                                      unsigned long size)
+{
+    unsigned long mfn, nr_pages, i;
+
+    mfn = PFN_DOWN(address);
+    nr_pages = PFN_UP((address & ~PAGE_MASK) + size);
+    for ( i = 0 ; i < nr_pages; i++ )
+        if ( !page_is_ram_type(mfn + i, RAM_TYPE_RESERVED) &&
+             !page_is_ram_type(mfn + i, RAM_TYPE_ACPI) )
+            return true;
+
+    return false;
+}
+
+static bool __init pvh_acpi_table_allowed(const char *sig)
+{
+    static const char __initconst banned_tables[][ACPI_NAME_SIZE] = {
+        ACPI_SIG_HPET, ACPI_SIG_SLIT, ACPI_SIG_SRAT, ACPI_SIG_MPST,
+        ACPI_SIG_PMTT, ACPI_SIG_MADT, ACPI_SIG_DMAR};
+    unsigned int i;
+
+    for ( i = 0 ; i < ARRAY_SIZE(banned_tables); i++ )
+        if ( strncmp(sig, banned_tables[i], ACPI_NAME_SIZE) == 0 )
+            return false;
+
+    /* Make sure table doesn't reside in a RAM region. */
+    if ( acpi_memory_banned(acpi_gbl_root_table_list.tables[i].address,
+                            acpi_gbl_root_table_list.tables[i].length) )
+    {
+        printk("Skipping table %.4s because resides in a non-ACPI, non-reserved region\n",
+               sig);
+        return false;
+    }
+
+    return true;
+}
+
+static int __init pvh_setup_acpi_xsdt(struct domain *d, paddr_t madt_addr,
+                                      paddr_t *addr)
+{
+    struct acpi_table_xsdt *xsdt;
+    struct acpi_table_header *table;
+    struct acpi_table_rsdp *rsdp;
+    unsigned long size = sizeof(*xsdt);
+    unsigned int i, j, num_tables = 0;
+    paddr_t xsdt_paddr;
+    int rc;
+
+    /*
+     * Restore the original DMAR table signature; we are going to filter it
+     * out of the new XSDT presented to the guest, so there is no longer any
+     * need to keep its signature zapped.
+     */
+    acpi_dmar_reinstate();
+
+    /* Count the number of tables that will be added to the XSDT. */
+    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
+    {
+        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
+
+        if ( pvh_acpi_table_allowed(sig) )
+            num_tables++;
+    }
+
+    /*
+     * No need to add or subtract one slot: struct acpi_table_xsdt already
+     * includes one array entry, which is used for the custom-built MADT
+     * that replaces the filtered-out native one.
+     */
+    size += num_tables * sizeof(xsdt->table_offset_entry[0]);
+
+    xsdt = xzalloc_bytes(size);
+    if ( !xsdt )
+    {
+        printk("Unable to allocate memory for XSDT table\n");
+        rc = -ENOMEM;
+        goto out;
+    }
+
+    /* Copy the native XSDT table header. */
+    rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(*rsdp));
+    if ( !rsdp )
+    {
+        printk("Unable to map RSDP\n");
+        rc = -EINVAL;
+        goto out;
+    }
+    xsdt_paddr = rsdp->xsdt_physical_address;
+    acpi_os_unmap_memory(rsdp, sizeof(*rsdp));
+    table = acpi_os_map_memory(xsdt_paddr, sizeof(*table));
+    if ( !table )
+    {
+        printk("Unable to map XSDT\n");
+        rc = -EINVAL;
+        goto out;
+    }
+    xsdt->header = *table;
+    acpi_os_unmap_memory(table, sizeof(*table));
+
+    /* Add the custom MADT. */
+    xsdt->table_offset_entry[0] = madt_addr;
+
+    /* Copy the addresses of the rest of the allowed tables. */
+    for ( i = 0, j = 1; i < acpi_gbl_root_table_list.count; i++ )
+    {
+        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
+
+        if ( pvh_acpi_table_allowed(sig) )
+            xsdt->table_offset_entry[j++] =
+                acpi_gbl_root_table_list.tables[i].address;
+    }
+
+    xsdt->header.revision = 1;
+    xsdt->header.length = size;
+    /*
+     * Calling acpi_tb_checksum here is a layering violation, but
+     * introducing a wrapper for such simple usage seems overkill.
+     */
+    xsdt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, xsdt), size);
+
+    /* Place the new XSDT in guest memory space. */
+    if ( pvh_steal_ram(d, size, 0, GB(4), addr) )
+    {
+        printk("Unable to find guest RAM for XSDT\n");
+        rc = -ENOMEM;
+        goto out;
+    }
+
+    /* Mark this region as E820_ACPI. */
+    if ( pvh_add_mem_range(d, *addr, *addr + size, E820_ACPI) )
+        printk("Unable to add XSDT region to memory map\n");
+
+    rc = hvm_copy_to_guest_phys(*addr, xsdt, size, d->vcpu[0]);
+    if ( rc )
+    {
+        printk("Unable to copy XSDT into guest memory\n");
+        goto out;
+    }
+
+    rc = 0;
+
+ out:
+    xfree(xsdt);
+
+    return rc;
+}
+
+static int __init pvh_setup_acpi(struct domain *d, paddr_t start_info)
+{
+    unsigned long pfn, nr_pages;
+    paddr_t madt_paddr, xsdt_paddr, rsdp_paddr;
+    unsigned int i;
+    int rc;
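+    /*
+     * Revision 2 selects the ACPI 2.0+ RSDP layout, which includes the XSDT
+     * address and extended checksum fields filled in below.
+     */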
+    struct acpi_table_rsdp *native_rsdp, rsdp = {
+        .signature = ACPI_SIG_RSDP,
+        .revision = 2,
+        .length = sizeof(rsdp),
+    };
+
+    /* Scan top-level tables and add their regions to the guest memory map. */
+    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
+    {
+        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
+        unsigned long addr = acpi_gbl_root_table_list.tables[i].address;
+        unsigned long size = acpi_gbl_root_table_list.tables[i].length;
+
+        /*
+         * Make sure the original MADT is also mapped, so that Dom0 can
+         * properly access the data returned by _MAT methods in case it's
+         * re-using MADT memory.
+         */
+        if ( strncmp(sig, ACPI_SIG_MADT, ACPI_NAME_SIZE)
+             ? pvh_acpi_table_allowed(sig)
+             : !acpi_memory_banned(addr, size) )
+            pvh_add_mem_range(d, addr, addr + size, E820_ACPI);
+    }
+
+    /* Identity map ACPI e820 regions. */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        if ( d->arch.e820[i].type != E820_ACPI &&
+             d->arch.e820[i].type != E820_NVS )
+            continue;
+
+        pfn = PFN_DOWN(d->arch.e820[i].addr);
+        nr_pages = PFN_UP((d->arch.e820[i].addr & ~PAGE_MASK) +
+                          d->arch.e820[i].size);
+
+        rc = modify_identity_mmio(d, pfn, nr_pages, true);
+        if ( rc )
+        {
+            printk("Failed to map ACPI region [%#lx, %#lx) into Dom0 memory map\n",
+                   pfn, pfn + nr_pages);
+            return rc;
+        }
+    }
+
+    rc = pvh_setup_acpi_madt(d, &madt_paddr);
+    if ( rc )
+        return rc;
+
+    rc = pvh_setup_acpi_xsdt(d, madt_paddr, &xsdt_paddr);
+    if ( rc )
+        return rc;
+
+    /* Craft a custom RSDP. */
+    native_rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(rsdp));
+    if ( !native_rsdp )
+    {
+        printk("Failed to map native RSDP\n");
+        return -ENOMEM;
+    }
+    memcpy(rsdp.oem_id, native_rsdp->oem_id, sizeof(rsdp.oem_id));
+    acpi_os_unmap_memory(native_rsdp, sizeof(rsdp));
+    rsdp.xsdt_physical_address = xsdt_paddr;
+    /*
+     * Calling acpi_tb_checksum here is a layering violation, but
+     * introducing a wrapper for such simple usage seems overkill.
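+     *
+     * The first checksum only covers the ACPI 1.0 portion of the RSDP
+     * (ACPI_RSDP_REV0_SIZE bytes), while the extended checksum covers the
+     * whole revision 2 structure.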
+     */
+    rsdp.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
+                                      ACPI_RSDP_REV0_SIZE);
+    rsdp.extended_checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
+                                               sizeof(rsdp));
+
+    /*
+     * Place the new RSDP in guest memory space.
+     *
+     * NB: this RSDP is not going to replace the original RSDP, which should
+     * still be accessible to the guest. However that RSDP is going to point to
+     * the native RSDT, and should not be used for the Dom0 kernel's boot
+     * purposes (we keep it visible for post boot access).
+     */
+    if ( pvh_steal_ram(d, sizeof(rsdp), 0, GB(4), &rsdp_paddr) )
+    {
+        printk("Unable to allocate guest RAM for RSDP\n");
+        return -ENOMEM;
+    }
+
+    /* Mark this region as E820_ACPI. */
+    if ( pvh_add_mem_range(d, rsdp_paddr, rsdp_paddr + sizeof(rsdp),
+                           E820_ACPI) )
+        printk("Unable to add RSDP region to memory map\n");
+
+    /* Copy RSDP into guest memory. */
+    rc = hvm_copy_to_guest_phys(rsdp_paddr, &rsdp, sizeof(rsdp), d->vcpu[0]);
+    if ( rc )
+    {
+        printk("Unable to copy RSDP into guest memory\n");
+        return rc;
+    }
+
+    /* Copy RSDP address to start_info. */
+    rc = hvm_copy_to_guest_phys(start_info +
+                                offsetof(struct hvm_start_info, rsdp_paddr),
+                                &rsdp_paddr,
+                                sizeof(((struct hvm_start_info *)
+                                        0)->rsdp_paddr),
+                                d->vcpu[0]);
+    if ( rc )
+    {
+        printk("Unable to copy RSDP into guest memory\n");
+        return rc;
+    }
+
+    return 0;
+}
+
+int __init dom0_construct_pvh(struct domain *d, const module_t *image,
+                              unsigned long image_headroom,
+                              module_t *initrd,
+                              void *(*bootstrap_map)(const module_t *),
+                              char *cmdline)
+{
+    paddr_t entry, start_info;
+    int rc;
+
+    printk("** Building a PVH Dom0 **\n");
+
+    iommu_hwdom_init(d);
+
+    rc = pvh_setup_p2m(d);
+    if ( rc )
+    {
+        printk("Failed to setup Dom0 physical memory map\n");
+        return rc;
+    }
+
+    rc = pvh_load_kernel(d, image, image_headroom, initrd, bootstrap_map(image),
+                         cmdline, &entry, &start_info);
+    if ( rc )
+    {
+        printk("Failed to load Dom0 kernel\n");
+        return rc;
+    }
+
+    rc = pvh_setup_cpus(d, entry, start_info);
+    if ( rc )
+    {
+        printk("Failed to setup Dom0 CPUs: %d\n", rc);
+        return rc;
+    }
+
+    rc = pvh_setup_acpi(d, start_info);
+    if ( rc )
+    {
+        printk("Failed to setup Dom0 ACPI tables: %d\n", rc);
+        return rc;
+    }
+
+    panic("Building a PVHv2 Dom0 is not yet supported.");
+    return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/dom0_build.h b/xen/include/asm-x86/dom0_build.h
index 5d093673d9..d83d2b4387 100644
--- a/xen/include/asm-x86/dom0_build.h
+++ b/xen/include/asm-x86/dom0_build.h
@@ -20,6 +20,15 @@  int dom0_construct_pv(struct domain *d, const module_t *image,
                       void *(*bootstrap_map)(const module_t *),
                       char *cmdline);
 
+int dom0_construct_pvh(struct domain *d, const module_t *image,
+                       unsigned long image_headroom,
+                       module_t *initrd,
+                       void *(*bootstrap_map)(const module_t *),
+                       char *cmdline);
+
+unsigned long dom0_paging_pages(const struct domain *d,
+                                unsigned long nr_pages);
+
 #endif	/* _DOM0_BUILD_H_ */
 
 /*