@@ -135,6 +135,7 @@ typedef struct PhysPageMap {
struct AddressSpaceDispatch {
struct rcu_head rcu;
+ MemoryRegionSection *mru_section;
/* This is a multi-level map on the physical address space.
* The bottom level has pointers to MemoryRegionSections.
*/
@@ -350,14 +351,25 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
hwaddr addr,
bool resolve_subpage)
{
- MemoryRegionSection *section;
+ /* Start from the section remembered by the previous slow-path lookup. */
+ MemoryRegionSection *section = atomic_read(&d->mru_section);
subpage_t *subpage;
- section = phys_page_find(d->phys_map, addr, d->map.nodes, d->map.sections);
+ if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
+ !section_covers_addr(section, addr)) {
+ /* Miss: walk the page map and remember the page-level result. */
+ section = phys_page_find(d->phys_map, addr, d->map.nodes,
+ d->map.sections);
+ /*
+ * Cache before subpage resolution, so that a later call with
+ * resolve_subpage == false never gets a sub-section back.
+ */
+ atomic_set(&d->mru_section, section);
+ }
if (resolve_subpage && section->mr->subpage) {
subpage = container_of(section->mr, subpage_t, iomem);
section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
}
return section;
}
Under heavy workloads the lookup will likely end up with the same MemoryRegionSection from last time. Using a pointer to cache the result, like ram_list.mru_block, significantly reduces the cost of address_space_translate. During address space topology update, as->dispatch will be reallocated so the pointer is invalidated automatically. Perf reports a visible drop in CPU usage, because phys_page_find is not called. Before: 2.35% qemu-system-x86_64 [.] phys_page_find 0.97% qemu-system-x86_64 [.] address_space_translate_internal 0.95% qemu-system-x86_64 [.] address_space_translate 0.55% qemu-system-x86_64 [.] address_space_lookup_region After: 0.97% qemu-system-x86_64 [.] address_space_translate_internal 0.97% qemu-system-x86_64 [.] address_space_lookup_region 0.84% qemu-system-x86_64 [.] address_space_translate Signed-off-by: Fam Zheng <famz@redhat.com> --- exec.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-)