Message ID | 1402076021-9425-1-git-send-email-m.smarduch@samsung.com (mailing list archive)
---|---
State | New, archived
On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote:
> kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch
> changed this function, this patch picks up those changes, re-tested everything
> works. Applies cleanly with other patches.
> 
> This patch adds support for keeping track of VM dirty pages. As dirty page log
> is retrieved, the pages that have been written are write protected again for
> next write and log read.
> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
>  arch/arm/include/asm/kvm_host.h |    3 ++
>  arch/arm/kvm/arm.c              |    5 ---
>  arch/arm/kvm/mmu.c              |   79 +++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.c              |   86 ---------------------------------------
>  virt/kvm/kvm_main.c             |   86 +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 168 insertions(+), 91 deletions(-)
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 59565f5..b760f9c 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
>  int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>  
>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> +		struct kvm_memory_slot *slot,
> +		gfn_t gfn_offset, unsigned long mask);

Do all other architectures implement this function?  arm64?

> 
>  #endif /* __ARM_KVM_HOST_H__ */
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index dfd63ac..f06fb21 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  	}
>  }
>  
> -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
> -{
> -	return -EINVAL;
> -}
> -

What about the other architectures implementing this function?

>  static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
>  					struct kvm_arm_device_addr *dev_addr)
>  {
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index e5dff85..907344c 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
>  	spin_unlock(&kvm->mmu_lock);
>  }
>  
> +/**
> + * stage2_wp_mask_range() - write protect memslot pages set in mask
> + * @pmd - pointer to page table
> + * @start_ipa - the start range of mask
> + * @addr - start_ipa or start range of adjusted mask if crossing PMD range
> + * @mask - mask of dirty pages
> + *
> + * Walk mask and write protect the associated dirty pages in the memory region.
> + * If mask crosses a PMD range adjust it to next page table and return.
> + */
> +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa,
> +		phys_addr_t *addr, unsigned long *mask)
> +{
> +	pte_t *pte;
> +	bool crosses_pmd;
> +	int i = __ffs(*mask);
> +
> +	if (unlikely(*addr > start_ipa))
> +		start_ipa = *addr - i * PAGE_SIZE;

huh?

> +	pte = pte_offset_kernel(pmd, start_ipa);
> +	for (*addr = start_ipa + i * PAGE_SIZE; *mask;
> +	     i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) {
> +		crosses_pmd = !!((start_ipa & PMD_MASK) ^ (*addr & PMD_MASK));
> +		if (unlikely(crosses_pmd)) {
> +			/* Adjust mask dirty bits relative to next page table */
> +			*mask >>= (PTRS_PER_PTE - pte_index(start_ipa));
> +			return;
> +		}
> +		if (!pte_none(pte[i]))
> +			kvm_set_s2pte_readonly(&pte[i]);
> +		*mask &= ~(1 << i);

This is *really* complicated, and *really* unintuitive and *really* hard
to read!

I feel this may very likely break, and is optimizing prematurely for
some very special case. Can't you follow the usual scheme of traversing
the levels one-by-one and just calculate the 'end' address based on the
number of bits in your long, and just adjust the mask in the calling
function each time you are about to call a lower-level function?

In fact, I think this could be trivially implemented as an extension to
your existing wp_range functions. On ARM you are mostly going to
consider 32 pages, on arm64 you are mostly going to consider 64 pages,
just calculate that range in terms of IPAs and set that as the limit for
calling stage2_wp_pgd_range (which should be factor'ed out into its
function and called from kvm_mmu_wp_memory_region).

> +	}
> +}
> +
> +/**
> + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask
> + * @kvm:	The KVM pointer
> + * @slot:	The memory slot associated with mask
> + * @gfn_offset:	The gfn offset in memory slot
> + * @mask:	The mask of dirty pages at offset 'gnf_offset' in this memory

s/gnf_offset/gfn_offset/

> + *		slot to be write protected
> + *
> + * Called from dirty page logging read function to write protect bits set in
> + * mask to record future writes to these pages in dirty page log. This function
> + * uses simplified page table walk given  mask can spawn no more then 2 PMD

random double white space before mask

> + * table range.
> + * 'kvm->mmu_lock' must be held to protect against concurrent modification
> + * of page tables (2nd stage fault, mmu modifiers, ...)
> + *
> + */
> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> +		struct kvm_memory_slot *slot,
> +		gfn_t gfn_offset, unsigned long mask)
> +{
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	phys_addr_t start_ipa = (slot->base_gfn + gfn_offset) << PAGE_SHIFT;
> +	phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE;
> +	phys_addr_t addr = start_ipa;
> +	pgd_t *pgdp = kvm->arch.pgd, *pgd;
> +
> +	do {
> +		pgd = pgdp + pgd_index(addr);
> +		if (pgd_present(*pgd)) {
> +			pud = pud_offset(pgd, addr);
> +			if (!pud_none(*pud) && !pud_huge(*pud)) {
> +				pmd = pmd_offset(pud, addr);
> +				if (!pmd_none(*pmd) && !kvm_pmd_huge(*pmd))
> +					stage2_wp_mask_range(pmd, start_ipa,
> +							&addr, &mask);
> +				else
> +					addr += PMD_SIZE;
> +			} else
> +				addr += PUD_SIZE;

is this correct? what if your gfn_offset puts you at the last page of a
PUD, don't you need pud_addr_end() instead?

> +		} else
> +			addr += PGDIR_SIZE;

please use braces for both of the single-line else-clauses above when
the main if-clause is multi-line (see Documentation/CodingStyle chapter
3, just before 3.1).

> +	} while (mask && addr < end_ipa);

this seems like a complicated loop condition. It seems to me that
either you clear all the bits in the mask you want to check or you check
for an address range, why is there a need for both?

> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot,
>  			  unsigned long fault_status)

[...]

Thanks,
-Christoffer
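
For reference, the level-by-level scheme suggested in the review would look
roughly like this. This is a sketch only: stage2_wp_range() stands for the
factored-out range walker the review asks for, which does not exist in the
series at this point; __fls() picks the last set bit so the whole mask is
covered by one ordinary range walk:

    /* Hypothetical helper: the existing memslot write-protect walk,
     * factored out to take an explicit start/end IPA range. */
    static void stage2_wp_range(struct kvm *kvm, phys_addr_t start,
                                phys_addr_t end);

    void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
                    struct kvm_memory_slot *slot,
                    gfn_t gfn_offset, unsigned long mask)
    {
            phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
            /* Clamp the walk to the first and last set bits of the mask. */
            phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
            phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

            stage2_wp_range(kvm, start, end);
    }

This drops both the mask bookkeeping across PMD boundaries and the custom
loop condition: the range walker's own boundary handling does the work.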
On 06/08/2014 05:05 AM, Christoffer Dall wrote:
> On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote:
>> kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch
>> changed this function, this patch picks up those changes, re-tested everything
>> works. Applies cleanly with other patches.
>>
>> This patch adds support for keeping track of VM dirty pages. As dirty page log
>> is retrieved, the pages that have been written are write protected again for
>> next write and log read.
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>  arch/arm/include/asm/kvm_host.h |    3 ++
>>  arch/arm/kvm/arm.c              |    5 ---
>>  arch/arm/kvm/mmu.c              |   79 +++++++++++++++++++++++++++++++++++
>>  arch/x86/kvm/x86.c              |   86 ---------------------------------------
>>  virt/kvm/kvm_main.c             |   86 +++++++++++++++++++++++++++++++++++++++
>>  5 files changed, 168 insertions(+), 91 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>> index 59565f5..b760f9c 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
>>  int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>>  
>>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>> +		struct kvm_memory_slot *slot,
>> +		gfn_t gfn_offset, unsigned long mask);
> 
> Do all other architectures implement this function?  arm64?

Besides arm, x86 but the function is not generic.

> 
>>
>>  #endif /* __ARM_KVM_HOST_H__ */
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index dfd63ac..f06fb21 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>>  	}
>>  }
>>  
>> -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
>> -{
>> -	return -EINVAL;
>> -}
>> -
> 
> What about the other architectures implementing this function?

Six architectures define this function. With this patch this
function is generic in kvm_main.c used by x86.

> 
>>  static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
>>  					struct kvm_arm_device_addr *dev_addr)
>>  {
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index e5dff85..907344c 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
>>  	spin_unlock(&kvm->mmu_lock);
>>  }
>>  
>> +/**
>> + * stage2_wp_mask_range() - write protect memslot pages set in mask
>> + * @pmd - pointer to page table
>> + * @start_ipa - the start range of mask
>> + * @addr - start_ipa or start range of adjusted mask if crossing PMD range
>> + * @mask - mask of dirty pages
>> + *
>> + * Walk mask and write protect the associated dirty pages in the memory region.
>> + * If mask crosses a PMD range adjust it to next page table and return.
>> + */
>> +static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa,
>> +		phys_addr_t *addr, unsigned long *mask)
>> +{
>> +	pte_t *pte;
>> +	bool crosses_pmd;
>> +	int i = __ffs(*mask);
>> +
>> +	if (unlikely(*addr > start_ipa))
>> +		start_ipa = *addr - i * PAGE_SIZE;
> 
> huh?
> 
>> +	pte = pte_offset_kernel(pmd, start_ipa);
>> +	for (*addr = start_ipa + i * PAGE_SIZE; *mask;
>> +	     i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) {
>> +		crosses_pmd = !!((start_ipa & PMD_MASK) ^ (*addr & PMD_MASK));
>> +		if (unlikely(crosses_pmd)) {
>> +			/* Adjust mask dirty bits relative to next page table */
>> +			*mask >>= (PTRS_PER_PTE - pte_index(start_ipa));
>> +			return;
>> +		}
>> +		if (!pte_none(pte[i]))
>> +			kvm_set_s2pte_readonly(&pte[i]);
>> +		*mask &= ~(1 << i);
> 
> This is *really* complicated, and *really* unintuitive and *really* hard
> to read!
> 
> I feel this may very likely break, and is optimizing prematurely for
> some very special case. Can't you follow the usual scheme of traversing
> the levels one-by-one and just calculate the 'end' address based on the
> number of bits in your long, and just adjust the mask in the calling
> function each time you are about to call a lower-level function?

Agreed, I'll extend the wp_range functions, it probably makes no sense
to be optimizing at this phase.

> 
> In fact, I think this could be trivially implemented as an extension to
> your existing wp_range functions. On ARM you are mostly going to
> consider 32 pages, on arm64 you are mostly going to consider 64 pages,
> just calculate that range in terms of IPAs and set that as the limit for
> calling stage2_wp_pgd_range (which should be factor'ed out into its
> function and called from kvm_mmu_wp_memory_region).
> 
>> +	}
>> +}
>> +
>> +/**
>> + * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask
>> + * @kvm:	The KVM pointer
>> + * @slot:	The memory slot associated with mask
>> + * @gfn_offset:	The gfn offset in memory slot
>> + * @mask:	The mask of dirty pages at offset 'gnf_offset' in this memory
> 
> s/gnf_offset/gfn_offset/

ok

> 
>> + *		slot to be write protected
>> + *
>> + * Called from dirty page logging read function to write protect bits set in
>> + * mask to record future writes to these pages in dirty page log. This function
>> + * uses simplified page table walk given  mask can spawn no more then 2 PMD
> 
> random double white space before mask

ok

> 
>> + * table range.
>> + * 'kvm->mmu_lock' must be held to protect against concurrent modification
>> + * of page tables (2nd stage fault, mmu modifiers, ...)
>> + *
>> + */
>> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>> +		struct kvm_memory_slot *slot,
>> +		gfn_t gfn_offset, unsigned long mask)
>> +{
>> +	pud_t *pud;
>> +	pmd_t *pmd;
>> +	phys_addr_t start_ipa = (slot->base_gfn + gfn_offset) << PAGE_SHIFT;
>> +	phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE;
>> +	phys_addr_t addr = start_ipa;
>> +	pgd_t *pgdp = kvm->arch.pgd, *pgd;
>> +
>> +	do {
>> +		pgd = pgdp + pgd_index(addr);
>> +		if (pgd_present(*pgd)) {
>> +			pud = pud_offset(pgd, addr);
>> +			if (!pud_none(*pud) && !pud_huge(*pud)) {
>> +				pmd = pmd_offset(pud, addr);
>> +				if (!pmd_none(*pmd) && !kvm_pmd_huge(*pmd))
>> +					stage2_wp_mask_range(pmd, start_ipa,
>> +							&addr, &mask);
>> +				else
>> +					addr += PMD_SIZE;
>> +			} else
>> +				addr += PUD_SIZE;
> 
> is this correct? what if your gfn_offset puts you at the last page of a
> PUD, don't you need pud_addr_end() instead?
> 
>> +		} else
>> +			addr += PGDIR_SIZE;
> 
> please use braces for both of the single-line else-clauses above when
> the main if-clause is multi-line (see Documentation/CodingStyle chapter
> 3, just before 3.1).
> 
>> +	} while (mask && addr < end_ipa);
> 
> this seems like a complicated loop condition. It seems to me that
> either you clear all the bits in the mask you want to check or you check
> for an address range, why is there a need for both?
> 
>> +}
>> +
>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  			  struct kvm_memory_slot *memslot,
>>  			  unsigned long fault_status)
> 
> [...]
> 
> Thanks,
> -Christoffer
> 
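
A note for readers on the pud_addr_end() point raised above: the kernel's
*_addr_end() helpers return the next table boundary or the overall end,
whichever is smaller, so a walk that starts in the middle of a PUD can
never step past its range. An illustrative walker in that style (not code
from this series) looks like:

    static void wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
    {
            pud_t *pud = pud_offset(pgd, addr);
            phys_addr_t next;

            do {
                    /* min(next PUD boundary, end): no fixed PUD_SIZE step */
                    next = pud_addr_end(addr, end);
                    if (!pud_none(*pud)) {
                            /* descend and handle the PMD level here */
                    }
            } while (pud++, addr = next, addr != end);
    }

Stepping by a fixed PUD_SIZE, as the patch does, is exactly what this
idiom avoids.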
On Mon, Jun 09, 2014 at 06:47:12PM -0700, Mario Smarduch wrote:
> On 06/08/2014 05:05 AM, Christoffer Dall wrote:
> > On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote:
> >> kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch
> >> changed this function, this patch picks up those changes, re-tested everything
> >> works. Applies cleanly with other patches.
> >>
> >> This patch adds support for keeping track of VM dirty pages. As dirty page log
> >> is retrieved, the pages that have been written are write protected again for
> >> next write and log read.
> >>
> >> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> >> ---
> >>  arch/arm/include/asm/kvm_host.h |    3 ++
> >>  arch/arm/kvm/arm.c              |    5 ---
> >>  arch/arm/kvm/mmu.c              |   79 +++++++++++++++++++++++++++++++++++
> >>  arch/x86/kvm/x86.c              |   86 ---------------------------------------
> >>  virt/kvm/kvm_main.c             |   86 +++++++++++++++++++++++++++++++++++++++
> >>  5 files changed, 168 insertions(+), 91 deletions(-)
> >>
> >> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> >> index 59565f5..b760f9c 100644
> >> --- a/arch/arm/include/asm/kvm_host.h
> >> +++ b/arch/arm/include/asm/kvm_host.h
> >> @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
> >>  int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
> >>  
> >>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
> >> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> >> +		struct kvm_memory_slot *slot,
> >> +		gfn_t gfn_offset, unsigned long mask);
> >
> > Do all other architectures implement this function?  arm64?
> 
> Besides arm, x86 but the function is not generic.
> 

you're now calling this from generic code, so all architectures must
implement it, and the prototype should probably be in
include/linux/kvm_host.h, not in the arch-specific headers.

> >>
> >>  #endif /* __ARM_KVM_HOST_H__ */
> >> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> >> index dfd63ac..f06fb21 100644
> >> --- a/arch/arm/kvm/arm.c
> >> +++ b/arch/arm/kvm/arm.c
> >> @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> >>  	}
> >>  }
> >>  
> >> -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
> >> -{
> >> -	return -EINVAL;
> >> -}
> >> -
> >
> > What about the other architectures implementing this function?
> 
> Six architectures define this function. With this patch this
> function is generic in kvm_main.c used by x86.

But you're not defining it as a weak symbol (and I don't suspect that
you should unless other archs do this in a *very* different way), so you
need to either remove it from the other archs, make it a weak symbol (I
hope this is not the case) or do something else.

-Christoffer
On 06/10/2014 02:22 AM, Christoffer Dall wrote:
> On Mon, Jun 09, 2014 at 06:47:12PM -0700, Mario Smarduch wrote:
>> On 06/08/2014 05:05 AM, Christoffer Dall wrote:
>>> On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote:
>>>> kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch
>>>> changed this function, this patch picks up those changes, re-tested everything
>>>> works. Applies cleanly with other patches.
>>>>
>>>> This patch adds support for keeping track of VM dirty pages. As dirty page log
>>>> is retrieved, the pages that have been written are write protected again for
>>>> next write and log read.
>>>>
>>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>>>> ---
>>>>  arch/arm/include/asm/kvm_host.h |    3 ++
>>>>  arch/arm/kvm/arm.c              |    5 ---
>>>>  arch/arm/kvm/mmu.c              |   79 +++++++++++++++++++++++++++++++++++
>>>>  arch/x86/kvm/x86.c              |   86 ---------------------------------------
>>>>  virt/kvm/kvm_main.c             |   86 +++++++++++++++++++++++++++++++++++++++
>>>>  5 files changed, 168 insertions(+), 91 deletions(-)
>>>>
>>>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>>>> index 59565f5..b760f9c 100644
>>>> --- a/arch/arm/include/asm/kvm_host.h
>>>> +++ b/arch/arm/include/asm/kvm_host.h
>>>> @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
>>>>  int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>>>>  
>>>>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>>>> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>>>> +		struct kvm_memory_slot *slot,
>>>> +		gfn_t gfn_offset, unsigned long mask);
>>>
>>> Do all other architectures implement this function?  arm64?
>>
>> Besides arm, x86 but the function is not generic.
>>>
> 
> you're now calling this from generic code, so all architectures must
> implement it, and the prototype should probably be in
> include/linux/kvm_host.h, not in the arch-specific headers.

Ah ok.

> 
>>>>
>>>>  #endif /* __ARM_KVM_HOST_H__ */
>>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>>>> index dfd63ac..f06fb21 100644
>>>> --- a/arch/arm/kvm/arm.c
>>>> +++ b/arch/arm/kvm/arm.c
>>>> @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>>>>  	}
>>>>  }
>>>>  
>>>> -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
>>>> -{
>>>> -	return -EINVAL;
>>>> -}
>>>> -
>>>
>>> What about the other architectures implementing this function?
>>
>> Six architectures define this function. With this patch this
>> function is generic in kvm_main.c used by x86.
> 
> But you're not defining it as a weak symbol (and I don't suspect that
> you should unless other archs do this in a *very* different way), so you
> need to either remove it from the other archs, make it a weak symbol (I
> hope this is not the case) or do something else.

Mistake on my part, I just cut and pasted Xiao's recent x86 upstream patch
and didn't add the weak definition.

I looked at IA64, MIPS (two of them), S390, somewhat similar but quite
different implementations. They use a sync version, where the dirty bitmaps
are maintained at arch level and then copied to memslot->dirty_bitmap. There
is only commonality between x86 and ARM right now, x86 uses
memslot->dirty_bitmap directly.

Maybe this function should go back to architecture layer, it's
unlikely it can become generic across all architectures.

There is also the issue of kvm_flush_remote_tlbs(), that's also weak,
the generic one is using IPIs. Since it's only used in mmu.c maybe make
this one static.

> 
> -Christoffer
> 
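
For context on the hardware broadcast mentioned here: on ARM, stage-2 TLB
invalidations are broadcast to all CPUs in the inner-shareable domain by
the hardware itself, so an ARM-specific override needs no IPIs. A sketch
of the shape such an override could take (the VMID-wide HYP helper is
named __kvm_tlb_flush_vmid for illustration; at this point the tree only
has __kvm_tlb_flush_vmid_ipa):

    void kvm_flush_remote_tlbs(struct kvm *kvm)
    {
            /* TLBI is broadcast in hardware; no cross-CPU requests needed */
            kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
    }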
On Tue, Jun 10, 2014 at 11:08:24AM -0700, Mario Smarduch wrote:
> On 06/10/2014 02:22 AM, Christoffer Dall wrote:
> > On Mon, Jun 09, 2014 at 06:47:12PM -0700, Mario Smarduch wrote:
> >> On 06/08/2014 05:05 AM, Christoffer Dall wrote:
> >>> On Fri, Jun 06, 2014 at 10:33:41AM -0700, Mario Smarduch wrote:
> >>>> kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch
> >>>> changed this function, this patch picks up those changes, re-tested everything
> >>>> works. Applies cleanly with other patches.
> >>>>
> >>>> This patch adds support for keeping track of VM dirty pages. As dirty page log
> >>>> is retrieved, the pages that have been written are write protected again for
> >>>> next write and log read.
> >>>>
> >>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> >>>> ---
> >>>>  arch/arm/include/asm/kvm_host.h |    3 ++
> >>>>  arch/arm/kvm/arm.c              |    5 ---
> >>>>  arch/arm/kvm/mmu.c              |   79 +++++++++++++++++++++++++++++++++++
> >>>>  arch/x86/kvm/x86.c              |   86 ---------------------------------------
> >>>>  virt/kvm/kvm_main.c             |   86 +++++++++++++++++++++++++++++++++++++++
> >>>>  5 files changed, 168 insertions(+), 91 deletions(-)
> >>>>
> >>>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> >>>> index 59565f5..b760f9c 100644
> >>>> --- a/arch/arm/include/asm/kvm_host.h
> >>>> +++ b/arch/arm/include/asm/kvm_host.h
> >>>> @@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
> >>>>  int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
> >>>>  
> >>>>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
> >>>> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> >>>> +		struct kvm_memory_slot *slot,
> >>>> +		gfn_t gfn_offset, unsigned long mask);
> >>>
> >>> Do all other architectures implement this function?  arm64?
> >>
> >> Besides arm, x86 but the function is not generic.
> >>>
> > 
> > you're now calling this from generic code, so all architectures must
> > implement it, and the prototype should probably be in
> > include/linux/kvm_host.h, not in the arch-specific headers.
> Ah ok.
> > 
> >>>>
> >>>>  #endif /* __ARM_KVM_HOST_H__ */
> >>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> >>>> index dfd63ac..f06fb21 100644
> >>>> --- a/arch/arm/kvm/arm.c
> >>>> +++ b/arch/arm/kvm/arm.c
> >>>> @@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> >>>>  	}
> >>>>  }
> >>>>  
> >>>> -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
> >>>> -{
> >>>> -	return -EINVAL;
> >>>> -}
> >>>> -
> >>>
> >>> What about the other architectures implementing this function?
> >>
> >> Six architectures define this function. With this patch this
> >> function is generic in kvm_main.c used by x86.
> > 
> > But you're not defining it as a weak symbol (and I don't suspect that
> > you should unless other archs do this in a *very* different way), so you
> > need to either remove it from the other archs, make it a weak symbol (I
> > hope this is not the case) or do something else.
> Mistake on my part, I just cut and pasted Xiao's recent x86 upstream patch
> and didn't add the weak definition.
> 
> I looked at IA64, MIPS (two of them), S390, somewhat similar but quite
> different implementations. They use a sync version, where the dirty bitmaps
> are maintained at arch level and then copied to memslot->dirty_bitmap. There
> is only commonality between x86 and ARM right now, x86 uses
> memslot->dirty_bitmap directly.
> 
> Maybe this function should go back to architecture layer, it's
> unlikely it can become generic across all architectures.
> 
> There is also the issue of kvm_flush_remote_tlbs(), that's also weak,
> the generic one is using IPIs. Since it's only used in mmu.c maybe make
> this one static.
> 

So I don't see a lot of use of weak symbols in kvm_main.c (actually on
kvmarm/next I don't see any), but we do want to share code when more
than one architecture implements something in the exact same way, like
it seems x86 and ARM is doing here for this particular function.

I think the KVM scheme is usually to check for some define, like:

#ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG
	ret = kvm_arch_get_dirty_log(...);
#else
	ret = kvm_get_dirty_log(...);
#endif

but Paolo may have a more informed opinion of how to deal with these.

Thanks,
-Christoffer
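
Spelled out, the define-based dispatch suggested here would look roughly
as follows (all names are illustrative placeholders taken from the mail
above, not existing symbols):

    /* arch/arm/include/asm/kvm_host.h would advertise its override: */
    #define KVM_ARCH_HAVE_GET_DIRTY_LOG
    int kvm_arch_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);

    /* and the generic path in virt/kvm/kvm_main.c would test for it: */
    int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
    {
    #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG
            return kvm_arch_get_dirty_log(kvm, log);
    #else
            return kvm_get_dirty_log(kvm, log);	/* shared implementation */
    #endif
    }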
Hi Paolo,
   for ARM dirty page logging we have a couple of functions that are generic.

- kvm_vm_ioctl_get_dirty_log - is identical to the x86 version
- kvm_flush_remote_tlbs - the ARM version does a hardware broadcast, so
  it's different from the generic one in kvm_main.c

How to proceed to make these generic? Please see below from Christoffer.
The current patch moves kvm_vm_ioctl_get_dirty_log() into kvm_main.c and
labels it and kvm_flush_remote_tlbs weak.

Please advise.

Thanks,
- Mario

> So I don't see a lot of use of weak symbols in kvm_main.c (actually on
> kvmarm/next I don't see any), but we do want to share code when more
> than one architecture implements something in the exact same way, like
> it seems x86 and ARM is doing here for this particular function.
> 
> I think the KVM scheme is usually to check for some define, like:
> 
> #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG
> 	ret = kvm_arch_get_dirty_log(...);
> #else
> 	ret = kvm_get_dirty_log(...);
> #endif
> 
> but Paolo may have a more informed opinion of how to deal with these.
> 
> Thanks,
> -Christoffer
> 
On 06/11/2014 12:03 AM, Christoffer Dall wrote:
>>
>> There is also the issue of kvm_flush_remote_tlbs(), that's also weak,
>> the generic one is using IPIs. Since it's only used in mmu.c maybe make
>> this one static.
>>
> So I don't see a lot of use of weak symbols in kvm_main.c (actually on
> kvmarm/next I don't see any), but we do want to share code when more
> than one architecture implements something in the exact same way, like
> it seems x86 and ARM is doing here for this particular function.
> 
> I think the KVM scheme is usually to check for some define, like:
> 
> #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG
> 	ret = kvm_arch_get_dirty_log(...);
> #else
> 	ret = kvm_get_dirty_log(...);
> #endif
> 
> but Paolo may have a more informed opinion of how to deal with these.
> 
> Thanks,
> -Christoffer
> 

One approach I'm trying is looking at the code in kvm_main().
This approach applies more to selecting features as opposed to
selecting generic vs architecture specific functions.

1.-------------------------------------------------
- add to 'virt/kvm/Kconfig'
config HAVE_KVM_ARCH_TLB_FLUSH_ALL
       bool

config HAVE_KVM_ARCH_DIRTY_LOG
       bool
2.--------------------------------------------------
For ARM and later ARM64 add to 'arch/arm[64]/kvm/Kconfig'
config KVM
	bool "Kernel-based Virtual Machine (KVM) support"
	...
	select HAVE_KVM_ARCH_TLB_FLUSH_ALL
	..

Not for HAVE_KVM_ARCH_DIRTY_LOG given it's shared with x86,
but would need to do it for every other architecture that
does not share it (except initially for arm64 since it
will use the variant that returns -EINVAL until feature
is supported)

3------------------------------------------------------
In kvm_main.c would have something like

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
	kvm_arch_flush_remote_tlbs(kvm);
#else
	long dirty_count = kvm->tlbs_dirty;

	smp_mb();
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
#endif
}

Then add void kvm_flush_remote_tlbs(struct kvm *kvm) definition
to arm kvm_host.h. Define the function in this case mmu.c

For the dirty log function
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
			       struct kvm_dirty_log *log)
{
#ifdef CONFIG_HAVE_KVM_ARCH_DIRTY_LOG
	kvm_arch_vm_ioctl_get_dirty_log(kvm, log);
#else
	int r;
	struct kvm_memory_slot *memslot;
	unsigned long n, i;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool is_dirty = false;
	...

But then you have to go into every architecture and define the
kvm_arch_vm_...() variant.

Is this the right way to go? Or is there a simpler way?

Thanks,
- Mario
On Tue, Jun 17, 2014 at 06:41:52PM -0700, Mario Smarduch wrote:
> On 06/11/2014 12:03 AM, Christoffer Dall wrote:
> 
> >>
> >> There is also the issue of kvm_flush_remote_tlbs(), that's also weak,
> >> the generic one is using IPIs. Since it's only used in mmu.c maybe make
> >> this one static.
> >>
> > So I don't see a lot of use of weak symbols in kvm_main.c (actually on
> > kvmarm/next I don't see any), but we do want to share code when more
> > than one architecture implements something in the exact same way, like
> > it seems x86 and ARM is doing here for this particular function.
> > 
> > I think the KVM scheme is usually to check for some define, like:
> > 
> > #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG
> > 	ret = kvm_arch_get_dirty_log(...);
> > #else
> > 	ret = kvm_get_dirty_log(...);
> > #endif
> > 
> > but Paolo may have a more informed opinion of how to deal with these.
> > 
> > Thanks,
> > -Christoffer
> > 
> 
> 
> One approach I'm trying is looking at the code in kvm_main().
> This approach applies more to selecting features as opposed to
> selecting generic vs architecture specific functions.
> 
> 1.-------------------------------------------------
> - add to 'virt/kvm/Kconfig'
> config HAVE_KVM_ARCH_TLB_FLUSH_ALL
>        bool
> 
> config HAVE_KVM_ARCH_DIRTY_LOG
>        bool
> 2.--------------------------------------------------
> For ARM and later ARM64 add to 'arch/arm[64]/kvm/Kconfig'
> config KVM
> 	bool "Kernel-based Virtual Machine (KVM) support"
> 	...
> 	select HAVE_KVM_ARCH_TLB_FLUSH_ALL
> 	..
> 
> Not for HAVE_KVM_ARCH_DIRTY_LOG given it's shared with x86,
> but would need to do it for every other architecture that
> does not share it (except initially for arm64 since it
> will use the variant that returns -EINVAL until feature
> is supported)
> 
> 3------------------------------------------------------
> In kvm_main.c would have something like
> 
> void kvm_flush_remote_tlbs(struct kvm *kvm)
> {
> #ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
> 	kvm_arch_flush_remote_tlbs(kvm);
> #else
> 	long dirty_count = kvm->tlbs_dirty;
> 
> 	smp_mb();
> 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
> 		++kvm->stat.remote_tlb_flush;
> 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
> #endif
> }
> 
> Then add void kvm_flush_remote_tlbs(struct kvm *kvm) definition
> to arm kvm_host.h. Define the function in this case mmu.c
> 
> For the dirty log function
> int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
> 			       struct kvm_dirty_log *log)
> {
> #ifdef CONFIG_HAVE_KVM_ARCH_DIRTY_LOG
> 	kvm_arch_vm_ioctl_get_dirty_log(kvm, log);
> #else
> 	int r;
> 	struct kvm_memory_slot *memslot;
> 	unsigned long n, i;
> 	unsigned long *dirty_bitmap;
> 	unsigned long *dirty_bitmap_buffer;
> 	bool is_dirty = false;
> 	...
> 
> But then you have to go into every architecture and define the
> kvm_arch_vm_...() variant.
> 
> Is this the right way to go? Or is there a simpler way?
> 

Hmmm, I'm really not an expert in the 'established procedures' for what
to put in config files etc., but here's my basic take:

a) you wouldn't put a config option in Kconfig unless it's something
that's actually configurable or some generic feature/subsystem that
should only be enabled if hardware has certain capabilities or other
config options enabled.

b) this seems entirely an implementation issue and not depending on
anything users should select.

c) therefore, I think it's either a question of always having an
arch-specific implementation that you probe for its return value or you
have some sort of define in the header files for the
arch/X/include/asm/kvm_host.h to control what you need.

-Christoffer
Il 03/07/2014 17:04, Christoffer Dall ha scritto:
> Hmmm, I'm really not an expert in the 'established procedures' for what
> to put in config files etc., but here's my basic take:
> 
> a) you wouldn't put a config option in Kconfig unless it's something
> that's actually configurable or some generic feature/subsystem that
> should only be enabled if hardware has certain capabilities or other
> config options enabled.
> 
> b) this seems entirely an implementation issue and not depending on
> anything users should select.

Actually I think Mario's idea is just fine. Non-user-accessible Kconfig
symbols are used a lot to invoke an #ifdef elsewhere in the code;
compared with these, his proposal is a bit different but not too much.

Sometimes #defines are used, sometimes Kconfig symbols, but the idea is
the same.

Paolo
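
For background on why such Kconfig symbols are non-user-accessible: a
bool with no prompt string never appears in menuconfig and can only be
enabled when another symbol selects it, which makes it behave like a
per-architecture #define. A minimal sketch using Mario's proposed symbol
name from the earlier mail:

    # virt/kvm/Kconfig: no prompt text after 'bool', so the option is
    # invisible to users and stays off unless an architecture selects it.
    config HAVE_KVM_ARCH_TLB_FLUSH_ALL
    	bool

    # arch/arm/kvm/Kconfig: the architecture opts in explicitly.
    config KVM
    	bool "Kernel-based Virtual Machine (KVM) support"
    	select HAVE_KVM_ARCH_TLB_FLUSH_ALL

Generic code then keys off CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL exactly as
in Mario's kvm_flush_remote_tlbs() example above.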
On 07/04/2014 09:29 AM, Paolo Bonzini wrote:
> Il 03/07/2014 17:04, Christoffer Dall ha scritto:
>> Hmmm, I'm really not an expert in the 'established procedures' for what
>> to put in config files etc., but here's my basic take:
>>
>> a) you wouldn't put a config option in Kconfig unless it's something
>> that's actually configurable or some generic feature/subsystem that
>> should only be enabled if hardware has certain capabilities or other
>> config options enabled.
>>
>> b) this seems entirely an implementation issue and not depending on
>> anything users should select.
> 
> Actually I think Mario's idea is just fine. Non-user-accessible Kconfig
> symbols are used a lot to invoke an #ifdef elsewhere in the code;
> compared with these, his proposal is a bit different but not too much.
> 
> Sometimes #defines are used, sometimes Kconfig symbols, but the idea is
> the same.
> 
> Paolo

Hi Paolo,
   thanks for your feedback. I forgot to add that I tried the
#define ARCH_HAVE_... approach, but checkpatch rejected it and
insisted on Kconfig.

Thanks,
- Mario
Hi Christoffer,
   Just back from holiday - a short plan to resume work.

- move VM tlb flush and kvm log functions to generic, per Paolo's
  comments use the Kconfig approach
- update other architectures and make sure they compile
- keep it ARMv7 for now

Get maintainers to test the branch.

In parallel add dirty log support to ARMv8; to test, I would add a QEMU
monitor function to validate general operation.

Your thoughts?

Thanks,
  Mario

On 07/03/2014 08:04 AM, Christoffer Dall wrote:
> On Tue, Jun 17, 2014 at 06:41:52PM -0700, Mario Smarduch wrote:
>> On 06/11/2014 12:03 AM, Christoffer Dall wrote:
>>
>>>>
>>>> There is also the issue of kvm_flush_remote_tlbs(), that's also weak,
>>>> the generic one is using IPIs. Since it's only used in mmu.c maybe make
>>>> this one static.
>>>>
>>> So I don't see a lot of use of weak symbols in kvm_main.c (actually on
>>> kvmarm/next I don't see any), but we do want to share code when more
>>> than one architecture implements something in the exact same way, like
>>> it seems x86 and ARM is doing here for this particular function.
>>>
>>> I think the KVM scheme is usually to check for some define, like:
>>>
>>> #ifdef KVM_ARCH_HAVE_GET_DIRTY_LOG
>>> 	ret = kvm_arch_get_dirty_log(...);
>>> #else
>>> 	ret = kvm_get_dirty_log(...);
>>> #endif
>>>
>>> but Paolo may have a more informed opinion of how to deal with these.
>>>
>>> Thanks,
>>> -Christoffer
>>>
>>
>>
>> One approach I'm trying is looking at the code in kvm_main().
>> This approach applies more to selecting features as opposed to
>> selecting generic vs architecture specific functions.
>>
>> 1.-------------------------------------------------
>> - add to 'virt/kvm/Kconfig'
>> config HAVE_KVM_ARCH_TLB_FLUSH_ALL
>>        bool
>>
>> config HAVE_KVM_ARCH_DIRTY_LOG
>>        bool
>> 2.--------------------------------------------------
>> For ARM and later ARM64 add to 'arch/arm[64]/kvm/Kconfig'
>> config KVM
>> 	bool "Kernel-based Virtual Machine (KVM) support"
>> 	...
>> 	select HAVE_KVM_ARCH_TLB_FLUSH_ALL
>> 	..
>>
>> Not for HAVE_KVM_ARCH_DIRTY_LOG given it's shared with x86,
>> but would need to do it for every other architecture that
>> does not share it (except initially for arm64 since it
>> will use the variant that returns -EINVAL until feature
>> is supported)
>>
>> 3------------------------------------------------------
>> In kvm_main.c would have something like
>>
>> void kvm_flush_remote_tlbs(struct kvm *kvm)
>> {
>> #ifdef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
>> 	kvm_arch_flush_remote_tlbs(kvm);
>> #else
>> 	long dirty_count = kvm->tlbs_dirty;
>>
>> 	smp_mb();
>> 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
>> 		++kvm->stat.remote_tlb_flush;
>> 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
>> #endif
>> }
>>
>> Then add void kvm_flush_remote_tlbs(struct kvm *kvm) definition
>> to arm kvm_host.h. Define the function in this case mmu.c
>>
>> For the dirty log function
>> int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
>> 			       struct kvm_dirty_log *log)
>> {
>> #ifdef CONFIG_HAVE_KVM_ARCH_DIRTY_LOG
>> 	kvm_arch_vm_ioctl_get_dirty_log(kvm, log);
>> #else
>> 	int r;
>> 	struct kvm_memory_slot *memslot;
>> 	unsigned long n, i;
>> 	unsigned long *dirty_bitmap;
>> 	unsigned long *dirty_bitmap_buffer;
>> 	bool is_dirty = false;
>> 	...
>>
>> But then you have to go into every architecture and define the
>> kvm_arch_vm_...() variant.
>>
>> Is this the right way to go? Or is there a simpler way?
>>
> Hmmm, I'm really not an expert in the 'established procedures' for what
> to put in config files etc., but here's my basic take:
> 
> a) you wouldn't put a config option in Kconfig unless it's something
> that's actually configurable or some generic feature/subsystem that
> should only be enabled if hardware has certain capabilities or other
> config options enabled.
> 
> b) this seems entirely an implementation issue and not depending on
> anything users should select.
> 
> c) therefore, I think it's either a question of always having an
> arch-specific implementation that you probe for its return value or you
> have some sort of define in the header files for the
> arch/X/include/asm/kvm_host.h to control what you need.
> 
> -Christoffer
> 
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 59565f5..b760f9c 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -232,5 +232,8 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
 int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
 
 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+		struct kvm_memory_slot *slot,
+		gfn_t gfn_offset, unsigned long mask);
 
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index dfd63ac..f06fb21 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -780,11 +780,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 }
 
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
-{
-	return -EINVAL;
-}
-
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 					struct kvm_arm_device_addr *dev_addr)
 {
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index e5dff85..907344c 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -874,6 +874,85 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 	spin_unlock(&kvm->mmu_lock);
 }
 
+/**
+ * stage2_wp_mask_range() - write protect memslot pages set in mask
+ * @pmd - pointer to page table
+ * @start_ipa - the start range of mask
+ * @addr - start_ipa or start range of adjusted mask if crossing PMD range
+ * @mask - mask of dirty pages
+ *
+ * Walk mask and write protect the associated dirty pages in the memory region.
+ * If mask crosses a PMD range adjust it to next page table and return.
+ */
+static void stage2_wp_mask_range(pmd_t *pmd, phys_addr_t start_ipa,
+		phys_addr_t *addr, unsigned long *mask)
+{
+	pte_t *pte;
+	bool crosses_pmd;
+	int i = __ffs(*mask);
+
+	if (unlikely(*addr > start_ipa))
+		start_ipa = *addr - i * PAGE_SIZE;
+	pte = pte_offset_kernel(pmd, start_ipa);
+	for (*addr = start_ipa + i * PAGE_SIZE; *mask;
+	     i = __ffs(*mask), *addr = start_ipa + i * PAGE_SIZE) {
+		crosses_pmd = !!((start_ipa & PMD_MASK) ^ (*addr & PMD_MASK));
+		if (unlikely(crosses_pmd)) {
+			/* Adjust mask dirty bits relative to next page table */
+			*mask >>= (PTRS_PER_PTE - pte_index(start_ipa));
+			return;
+		}
+		if (!pte_none(pte[i]))
+			kvm_set_s2pte_readonly(&pte[i]);
+		*mask &= ~(1 << i);
+	}
+}
+
+/**
+ * kvm_mmu_write_protected_pt_masked() - write protect dirty pages set in mask
+ * @kvm:	The KVM pointer
+ * @slot:	The memory slot associated with mask
+ * @gfn_offset:	The gfn offset in memory slot
+ * @mask:	The mask of dirty pages at offset 'gnf_offset' in this memory
+ *		slot to be write protected
+ *
+ * Called from dirty page logging read function to write protect bits set in
+ * mask to record future writes to these pages in dirty page log. This function
+ * uses simplified page table walk given  mask can spawn no more then 2 PMD
+ * table range.
+ * 'kvm->mmu_lock' must be held to protect against concurrent modification
+ * of page tables (2nd stage fault, mmu modifiers, ...)
+ *
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+		struct kvm_memory_slot *slot,
+		gfn_t gfn_offset, unsigned long mask)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	phys_addr_t start_ipa = (slot->base_gfn + gfn_offset) << PAGE_SHIFT;
+	phys_addr_t end_ipa = start_ipa + BITS_PER_LONG * PAGE_SIZE;
+	phys_addr_t addr = start_ipa;
+	pgd_t *pgdp = kvm->arch.pgd, *pgd;
+
+	do {
+		pgd = pgdp + pgd_index(addr);
+		if (pgd_present(*pgd)) {
+			pud = pud_offset(pgd, addr);
+			if (!pud_none(*pud) && !pud_huge(*pud)) {
+				pmd = pmd_offset(pud, addr);
+				if (!pmd_none(*pmd) && !kvm_pmd_huge(*pmd))
+					stage2_wp_mask_range(pmd, start_ipa,
+							&addr, &mask);
+				else
+					addr += PMD_SIZE;
+			} else
+				addr += PUD_SIZE;
+		} else
+			addr += PGDIR_SIZE;
+	} while (mask && addr < end_ipa);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot,
 			  unsigned long fault_status)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c5582c3..a603ca3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3569,92 +3569,6 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 	return 0;
 }
 
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * We need to keep it in mind that VCPU threads can write to the bitmap
- * concurrently. So, to avoid losing data, we keep the following order for
- * each bit:
- *
- *   1. Take a snapshot of the bit and clear it if needed.
- *   2. Write protect the corresponding page.
- *   3. Flush TLB's if needed.
- *   4. Copy the snapshot to the userspace.
- *
- * Between 2 and 3, the guest may write to the page using the remaining TLB
- * entry. This is not a problem because the page will be reported dirty at
- * step 4 using the snapshot taken before and step 3 ensures that successive
- * writes will be logged for the next call.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
-{
-	int r;
-	struct kvm_memory_slot *memslot;
-	unsigned long n, i;
-	unsigned long *dirty_bitmap;
-	unsigned long *dirty_bitmap_buffer;
-	bool is_dirty = false;
-
-	mutex_lock(&kvm->slots_lock);
-
-	r = -EINVAL;
-	if (log->slot >= KVM_USER_MEM_SLOTS)
-		goto out;
-
-	memslot = id_to_memslot(kvm->memslots, log->slot);
-
-	dirty_bitmap = memslot->dirty_bitmap;
-	r = -ENOENT;
-	if (!dirty_bitmap)
-		goto out;
-
-	n = kvm_dirty_bitmap_bytes(memslot);
-
-	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
-	memset(dirty_bitmap_buffer, 0, n);
-
-	spin_lock(&kvm->mmu_lock);
-
-	for (i = 0; i < n / sizeof(long); i++) {
-		unsigned long mask;
-		gfn_t offset;
-
-		if (!dirty_bitmap[i])
-			continue;
-
-		is_dirty = true;
-
-		mask = xchg(&dirty_bitmap[i], 0);
-		dirty_bitmap_buffer[i] = mask;
-
-		offset = i * BITS_PER_LONG;
-		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
-	}
-
-	spin_unlock(&kvm->mmu_lock);
-
-	/* See the comments in kvm_mmu_slot_remove_write_access(). */
-	lockdep_assert_held(&kvm->slots_lock);
-
-	/*
-	 * All the TLBs can be flushed out of mmu lock, see the comments in
-	 * kvm_mmu_slot_remove_write_access().
-	 */
-	if (is_dirty)
-		kvm_flush_remote_tlbs(kvm);
-
-	r = -EFAULT;
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
-		goto out;
-
-	r = 0;
-out:
-	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 			bool line_status)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ba25765..c87c612 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -429,6 +429,92 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 }
 
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently. So, to avoid losing data, we keep the following order for
+ * each bit:
+ *
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Flush TLB's if needed.
+ *   4. Copy the snapshot to the userspace.
+ *
+ * Between 2 and 3, the guest may write to the page using the remaining TLB
+ * entry. This is not a problem because the page will be reported dirty at
+ * step 4 using the snapshot taken before and step 3 ensures that successive
+ * writes will be logged for the next call.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+	int r;
+	struct kvm_memory_slot *memslot;
+	unsigned long n, i;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+	bool is_dirty = false;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = -EINVAL;
+	if (log->slot >= KVM_USER_MEM_SLOTS)
+		goto out;
+
+	memslot = id_to_memslot(kvm->memslots, log->slot);
+
+	dirty_bitmap = memslot->dirty_bitmap;
+	r = -ENOENT;
+	if (!dirty_bitmap)
+		goto out;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
+
+	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+	memset(dirty_bitmap_buffer, 0, n);
+
+	spin_lock(&kvm->mmu_lock);
+
+	for (i = 0; i < n / sizeof(long); i++) {
+		unsigned long mask;
+		gfn_t offset;
+
+		if (!dirty_bitmap[i])
+			continue;
+
+		is_dirty = true;
+
+		mask = xchg(&dirty_bitmap[i], 0);
+		dirty_bitmap_buffer[i] = mask;
+
+		offset = i * BITS_PER_LONG;
+		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
+	}
+
+	spin_unlock(&kvm->mmu_lock);
+
+	/* See the comments in kvm_mmu_slot_remove_write_access(). */
+	lockdep_assert_held(&kvm->slots_lock);
+
+	/*
+	 * All the TLBs can be flushed out of mmu lock, see the comments in
+	 * kvm_mmu_slot_remove_write_access().
+	 */
+	if (is_dirty)
+		kvm_flush_remote_tlbs(kvm);
+
+	r = -EFAULT;
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		goto out;
+
+	r = 0;
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
kvm_vm_ioctl_get_dirty_log() is generic used by x86, ARM. x86 recent patch
changed this function, this patch picks up those changes, re-tested everything
works. Applies cleanly with other patches.

This patch adds support for keeping track of VM dirty pages. As dirty page log
is retrieved, the pages that have been written are write protected again for
next write and log read.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/arm/include/asm/kvm_host.h |    3 ++
 arch/arm/kvm/arm.c              |    5 ---
 arch/arm/kvm/mmu.c              |   79 +++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |   86 ---------------------------------------
 virt/kvm/kvm_main.c             |   86 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 168 insertions(+), 91 deletions(-)
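
For context on what the log consumer looks like, here is a minimal
userspace sketch of the read path this patch serves. KVM_GET_DIRTY_LOG
and struct kvm_dirty_log are the existing KVM UAPI; the open VM file
descriptor vm_fd, the slot number, the slot size, and the 4 KiB page
size are assumptions made for the example:

    #include <linux/kvm.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    /* Fetch-and-clear one memslot's dirty bitmap: one bit per page,
     * rounded up to 64-bit words as the KVM ABI expects. */
    static unsigned long *get_dirty_log(int vm_fd, uint32_t slot,
                                        size_t slot_bytes)
    {
            size_t pages = slot_bytes / 4096;
            size_t bitmap_bytes = ((pages + 63) / 64) * 8;
            unsigned long *bitmap = calloc(1, bitmap_bytes);
            struct kvm_dirty_log log = { .slot = slot };

            if (!bitmap)
                    return NULL;
            log.dirty_bitmap = bitmap;

            /* Pages written since the last call have their bit set; the
             * kernel write-protects them again so later writes are caught. */
            if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                    free(bitmap);
                    return NULL;
            }
            return bitmap;
    }

A live-migration loop would call this repeatedly, resending the pages
whose bits are set until the remaining dirty set is small enough to
stop the guest and copy the rest.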