[v4,5/9] s390/mm: Add huge page dirty sync support

Message ID	20180627135510.117945-6-frankja@linux.ibm.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <kvm-owner@kernel.org> Gateway: Authorized Use Only! Violators will be prosecuted for <kvm@vger.kernel.org> from <frankja@linux.ibm.com>; Wed, 27 Jun 2018 14:56:12 +0100 Gateway: Authorized Use Only! Violators will be prosecuted; (version=TLSv1/SSLv3 cipher=AES256-GCM-SHA384 bits=256/256) Wed, 27 Jun 2018 14:56:09 +0100 From: Janosch Frank <frankja@linux.ibm.com> To: kvm@vger.kernel.org Cc: david@redhat.com, borntraeger@de.ibm.com, schwidefsky@de.ibm.com, dominik.dingel@gmail.com Subject: [PATCH v4 5/9] s390/mm: Add huge page dirty sync support Date: Wed, 27 Jun 2018 14:55:06 +0100 In-Reply-To: <20180627135510.117945-1-frankja@linux.ibm.com> References: <20180627135510.117945-1-frankja@linux.ibm.com> Message-Id: <20180627135510.117945-6-frankja@linux.ibm.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 4324b2a55aa3..26a31c3b7db6 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -15,6 +15,8 @@ /* Status bits in the gmap segment entry. */ #define _SEGMENT_ENTRY_GMAP_SPLIT 0x0001 /* split huge pmd */ +/* Status bits only for huge segment entries */ +#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* user dirty (migration) */ /** * struct gmap_struct - guest address space @@ -151,4 +153,6 @@ void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, int gmap_mprotect_notify(struct gmap *, unsigned long start, unsigned long len, int prot); +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], + unsigned long gaddr, unsigned long vmaddr); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 242a82efc633..56331d53b357 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1106,6 +1106,10 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *sptep, pte_t *tptep, pte_t pte); void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); +void ptep_remove_dirty_protection_split(struct mm_struct *mm, pte_t *ptep, + unsigned long vmaddr); +bool test_and_clear_guest_dirty_split(struct mm_struct *mm, pmd_t *pmdp, + unsigned long vmaddr); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3b7a5151b6a5..de8dec849b60 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -511,19 +511,23 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) + struct kvm_memory_slot *memslot) { gfn_t cur_gfn, last_gfn; - unsigned long address; + unsigned long gaddr, vmaddr; + unsigned long *dirty = memslot->dirty_bitmap; struct gmap *gmap = kvm->arch.gmap; - /* Loop over all guest pages */ + /* Loop over all guest segments */ last_gfn = memslot->base_gfn + memslot->npages; - for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) { - address = gfn_to_hva_memslot(memslot, cur_gfn); + for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES, dirty += 4) { + gaddr = gfn_to_gpa(cur_gfn); + vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); + if (kvm_is_error_hva(vmaddr)) + continue; + + gmap_sync_dirty_log_pmd(gmap, dirty, gaddr, vmaddr); - if (test_and_clear_guest_dirty(gmap->mm, address)) - mark_page_dirty(kvm, cur_gfn); if (fatal_signal_pending(current)) return; cond_resched(); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 0d0b2cf7ae8d..3da4f85ec330 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -15,6 +15,7 @@ #include <linux/swapops.h> #include <linux/ksm.h> #include <linux/mman.h> +#include <linux/hugetlb.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -549,6 +550,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + pmd_t unprot; + pte_t *ptep; int rc; BUG_ON(gmap_is_shadow(gmap)); @@ -606,12 +609,29 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) vmaddr >> PMD_SHIFT, table); if (!rc) { if (pmd_large(*pmd)) { - *table = pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS_LARGE; + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC; } else *table = pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS; } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = __pmd((*table & (_SEGMENT_ENTRY_HARDWARE_BITS_LARGE + & ~_SEGMENT_ENTRY_PROTECT)) + | _SEGMENT_ENTRY_GMAP_UC); + gmap_pmdp_xchg(gmap, (pmd_t *)table, unprot, gaddr); + } else if (gmap_pmd_is_split((pmd_t *)table)) { + /* + * Split pmds are somewhere in-between a normal and a + * large pmd. As we don't share the page table, the + * host does not remove protection on a fault and we + * have to do it ourselves for the guest mapping. + */ + ptep = pte_offset_map((pmd_t *)table, gaddr); + if (pte_val(*ptep) & _PAGE_PROTECT) + ptep_remove_dirty_protection_split(mm, ptep, vmaddr); } spin_unlock(&gmap->guest_table_lock); spin_unlock(ptl); @@ -989,6 +1009,113 @@ static int gmap_pmd_split(struct gmap *gmap, unsigned long gaddr, pmd_t *pmdp) return 0; } +/** + * gmap_pmdp_force_prot - change access rights of a locked pmd + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @pmdp: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: software bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. + */ +static int gmap_pmdp_force_prot(struct gmap *gmap, unsigned long addr, + pmd_t *pmdp, int prot) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (prot == PROT_NONE && !pmd_i) { + pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + gmap_pmdp_xchg(gmap, pmdp, new, addr); + } + + if (prot == PROT_READ && !pmd_p) { + pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; + pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + gmap_pmdp_xchg(gmap, pmdp, new, addr); + } + return 0; +} + +/** + * gmap_pmdp_transfer_prot - transfer protection of guest pmd to host pmd + * @mm: the memory context + * @address: the affected host virtual address + * @gpmdp: guest pmd ptr + * @hpmdp: host pmd ptr + * + * Transfers the protection from a guest pmd to the associated guest + * pmd. This has to be done with a plain idte to circumvent the gmap + * invalidation hooks in the standard invalidation functions provided + * by pgtable.c. + */ +static void gmap_pmdp_transfer_prot(struct mm_struct *mm, unsigned long addr, + pmd_t *gpmdp, pmd_t *hpmdp) +{ + const int gpmd_i = pmd_val(*gpmdp) & _SEGMENT_ENTRY_INVALID; + const int gpmd_p = pmd_val(*gpmdp) & _SEGMENT_ENTRY_PROTECT; + const int hpmd_i = pmd_val(*hpmdp) & _SEGMENT_ENTRY_INVALID; + const int hpmd_p = pmd_val(*hpmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *hpmdp; + + /* Fastpath, change not needed. */ + if (hpmd_i || (hpmd_p && gpmd_p) || (!gpmd_i && !gpmd_p)) + return; + + if (gpmd_p && !hpmd_p) + pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + if (!gpmd_i && !hpmd_i) + pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; + + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, hpmdp, + IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(addr, hpmdp, 0, 0, + IDTE_GLOBAL); + else + __pmdp_csp(hpmdp); + *hpmdp = new; +} + +/* + * gmap_protect_pmd - set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EAGAIN if a fixup is needed. + * + * Expected to be called with sg->mm->mmap_sem in read and + * guest_table_lock held. + */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, pmd_t *pmdp, pmd_t *hpmdp, + int prot) +{ + int ret = 0; + + /* Protect gmap pmd for dirty tracking. */ + ret = gmap_pmdp_force_prot(gmap, gaddr, pmdp, prot); + /* + * Transfer protection back to the host pmd, so userspace has + * never more access rights than the VM. + */ + if (!ret) + gmap_pmdp_transfer_prot(gmap->mm, vmaddr, pmdp, hpmdp); + return ret; +} + + /* * gmap_protect_pte - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure @@ -2477,6 +2604,87 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) } EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); +/** + * gmap_test_and_clear_dirty_segment - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +bool gmap_test_and_clear_dirty_segment(struct gmap *gmap, pmd_t *pmdp, + pmd_t *hpmdp, unsigned long gaddr, + unsigned long vmaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + gmap_protect_pmd(gmap, gaddr, vmaddr, pmdp, hpmdp, PROT_READ); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) +{ + int i = 0; + pmd_t *pmdp, *hpmdp, fpmd; + spinlock_t *ptl; + + hpmdp = (pmd_t *)huge_pte_offset(gmap->mm, vmaddr, HPAGE_SIZE); + if (!hpmdp) + return; + ptl = pmd_lock(gmap->mm, hpmdp); + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) { + spin_unlock(ptl); + return; + } + + if (pmd_large(*pmdp)) { + if (gmap_test_and_clear_dirty_segment(gmap, pmdp, hpmdp, + gaddr, vmaddr)) + memset(bitmap, 0xff, 32); + } else { + /* We handle this here, as it's of the records from mm. */ + if (unlikely(gmap_pmd_is_split(pmdp))) { + for (; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + if (test_and_clear_guest_dirty_split(gmap->mm, pmdp, vmaddr)) + set_bit_le(i, bitmap); + fpmd = *hpmdp; + pmd_val(fpmd) |= _SEGMENT_ENTRY_PROTECT; + gmap_pmdp_transfer_prot(gmap->mm, vmaddr, + &fpmd, hpmdp); + } + } else { + for (; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + if (test_and_clear_guest_dirty(gmap->mm, vmaddr)) + set_bit_le(i, bitmap); + } + } + } + gmap_pmd_op_end(gmap, pmdp); + spin_unlock(ptl); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + static inline void thp_split_mm(struct mm_struct *mm) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7bdb15fc5487..7bc79aae3c25 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -705,6 +705,57 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) preempt_enable(); } +void ptep_remove_dirty_protection_split(struct mm_struct *mm, + pte_t *ptep, unsigned long vmaddr) +{ + pte_t unprot = __pte(pte_val(*ptep) & ~_PAGE_PROTECT); + pgste_t pgste; + unsigned long bits; + + pgste = pgste_get_lock(ptep); + pgste_val(pgste) |= PGSTE_UC_BIT; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + pgste_val(pgste) ^= bits; + ptep_notify_gmap(mm, vmaddr, ptep, bits); + ptep_ipte_global(mm, vmaddr, ptep, 0); + + *ptep = unprot; + pgste_set_unlock(ptep, pgste); +} +EXPORT_SYMBOL_GPL(ptep_remove_dirty_protection_split); + +bool test_and_clear_guest_dirty_split(struct mm_struct *mm, pmd_t *pmdp, + unsigned long vmaddr) +{ + bool dirty; + pte_t *ptep, pte; + pgste_t pgste; + unsigned long bits; + + ptep = pte_offset_map(pmdp, vmaddr); + pgste = pgste_get_lock(ptep); + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste_val(pgste) &= ~PGSTE_UC_BIT; + pte = *ptep; + if (dirty) { + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste_val(pgste) ^= bits; + ptep_notify_gmap(mm, vmaddr, ptep, bits); + } + ptep_ipte_global(mm, vmaddr, ptep, 0); + if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + pte_val(pte) |= _PAGE_PROTECT; + else + pte_val(pte) |= _PAGE_INVALID; + *ptep = pte; + } + pgste_set_unlock(ptep, pgste); + return dirty; +} +EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty_split); + /* * Test and reset if a guest page is dirty */ @@ -731,12 +782,6 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pmd = pmd_alloc(mm, pud, addr); if (!pmd) return false; - /* We can't run guests backed by huge pages, but userspace can - * still set them up and then try to migrate them without any - * migration support. - */ - if (pmd_large(*pmd)) - return true; ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (unlikely(!ptep))

[v4,5/9] s390/mm: Add huge page dirty sync support

Commit Message

Comments

Patch