Message ID | 20240903232241.43995-9-anthony.yznaga@oracle.com
---|---
State | New
Series | Add support for shared PTEs across processes
On Tue, Sep 03, 2024 at 04:22:39PM -0700, Anthony Yznaga wrote:
> From: Khalid Aziz <khalid@kernel.org>
>
> Add support for handling page faults in an mshare region by
> redirecting the faults to operate on the mshare mm_struct and
> vmas contained in it and to link the page tables of the faulting
> process with the shared page tables in the mshare mm.
> Modify the unmap path to ensure that page tables in mshare regions
> are kept intact when a process exits. Note that they are also
> kept intact and not unlinked from a process when an mshare region
> is explicitly unmapped which is bug to be addressed.
>
> Signed-off-by: Khalid Aziz <khalid@kernel.org>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
> ---
> mm/internal.h | 1 +
> mm/memory.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++---
> mm/mshare.c | 38 +++++++++++++++++++++++++++++++
> 3 files changed, 98 insertions(+), 3 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 8005d5956b6e..8ac224d96806 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1578,6 +1578,7 @@ void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
> void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
> void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
>
> +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, unsigned long *addrp);
> static inline bool vma_is_shared(const struct vm_area_struct *vma)
> {
> return VM_SHARED_PT && (vma->vm_flags & VM_SHARED_PT);
> diff --git a/mm/memory.c b/mm/memory.c
> index 3c01d68065be..f526aef71a61 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -387,11 +387,15 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> vma_start_write(vma);
> unlink_anon_vmas(vma);
>
> + /*
> + * There is no page table to be freed for vmas that
> + * are mapped in mshare regions
> + */
> if (is_vm_hugetlb_page(vma)) {
> unlink_file_vma(vma);
> hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
> floor, next ? next->vm_start : ceiling);
> - } else {
> + } else if (!vma_is_shared(vma)) {
> unlink_file_vma_batch_init(&vb);
> unlink_file_vma_batch_add(&vb, vma);
>
> @@ -399,7 +403,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> * Optimization: gather nearby vmas into one call down
> */
> while (next && next->vm_start <= vma->vm_end + PMD_SIZE
> - && !is_vm_hugetlb_page(next)) {
> + && !is_vm_hugetlb_page(next)
> + && !vma_is_shared(next)) {
> vma = next;
> next = mas_find(mas, ceiling - 1);
> if (unlikely(xa_is_zero(next)))
> @@ -412,7 +417,9 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> unlink_file_vma_batch_final(&vb);
> free_pgd_range(tlb, addr, vma->vm_end,
> floor, next ? next->vm_start : ceiling);
> - }
> + } else
> + unlink_file_vma(vma);
> +
> vma = next;

I would rather have vma->vm_ops->free_pgtables() hook that would be defined
to non-NULL for mshared and hugetlb VMAs

> } while (vma);
> }
> @@ -1797,6 +1804,13 @@ void unmap_page_range(struct mmu_gather *tlb,
> pgd_t *pgd;
> unsigned long next;
>
> + /*
> + * No need to unmap vmas that share page table through
> + * mshare region
> + */
> + if (vma_is_shared(vma))
> + return;
> +

Ditto. vma->vm_ops->unmap_page_range().

> BUG_ON(addr >= end);
> tlb_start_vma(tlb, vma);
> pgd = pgd_offset(vma->vm_mm, addr);
> @@ -5801,6 +5815,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
> struct mm_struct *mm = vma->vm_mm;
> vm_fault_t ret;
> bool is_droppable;
> + bool shared = false;
>
> __set_current_state(TASK_RUNNING);
>
> @@ -5808,6 +5823,21 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
> if (ret)
> goto out;
>
> + if (unlikely(vma_is_shared(vma))) {
> + /* XXX mshare does not support per-VMA locks yet so fallback to mm lock */
> + if (flags & FAULT_FLAG_VMA_LOCK) {
> + vma_end_read(vma);
> + return VM_FAULT_RETRY;
> + }
> +
> + ret = find_shared_vma(&vma, &address);
> + if (ret)
> + return ret;
> + if (!vma)
> + return VM_FAULT_SIGSEGV;
> + shared = true;

Do we need to update 'mm' variable here?

It is going to be used to account the fault below. Not sure which mm has
to account such faults.

> + }
> +
> if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
> flags & FAULT_FLAG_INSTRUCTION,
> flags & FAULT_FLAG_REMOTE)) {
> @@ -5843,6 +5873,32 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
> if (is_droppable)
> ret &= ~VM_FAULT_OOM;
>
> + /*
> + * Release the read lock on the shared mm of a shared VMA unless
> + * unless the lock has already been released.
> + * The mmap lock will already have been released if __handle_mm_fault
> + * returns VM_FAULT_COMPLETED or if it returns VM_FAULT_RETRY and
> + * the flags FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT are
> + * _not_ both set.
> + * If the lock was released earlier, release the lock on the task's
> + * mm now to keep lock state consistent.
> + */
> + if (shared) {
> + int release_mmlock = 1;
> +
> + if ((ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) == 0) {
> + mmap_read_unlock(vma->vm_mm);
> + release_mmlock = 0;
> + } else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
> + (flags & FAULT_FLAG_RETRY_NOWAIT)) {
> + mmap_read_unlock(vma->vm_mm);
> + release_mmlock = 0;
> + }
> +
> + if (release_mmlock)
> + mmap_read_unlock(mm);
> + }
> +
> if (flags & FAULT_FLAG_USER) {
> mem_cgroup_exit_user_fault();
> /*
> diff --git a/mm/mshare.c b/mm/mshare.c
> index f3f6ed9c3761..8f47c8d6e6a4 100644
> --- a/mm/mshare.c
> +++ b/mm/mshare.c
> @@ -19,6 +19,7 @@
> #include <linux/spinlock_types.h>
> #include <uapi/linux/magic.h>
> #include <uapi/linux/msharefs.h>
> +#include "internal.h"
>
> struct mshare_data {
> struct mm_struct *mm;
> @@ -33,6 +34,43 @@ struct msharefs_info {
> static const struct inode_operations msharefs_dir_inode_ops;
> static const struct inode_operations msharefs_file_inode_ops;
>
> +/* Returns holding the host mm's lock for read. Caller must release. */
> +vm_fault_t
> +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
> +{
> + struct vm_area_struct *vma, *guest = *vmap;
> + struct mshare_data *m_data = guest->vm_private_data;
> + struct mm_struct *host_mm = m_data->mm;
> + unsigned long host_addr;
> + pgd_t *pgd, *guest_pgd;
> +
> + mmap_read_lock(host_mm);

Hm. So we have current->mm locked here, right? So this is nested mmap
lock. Have you tested it under lockdep? I expected it to complain.

> + host_addr = *addrp - guest->vm_start + host_mm->mmap_base;
> + pgd = pgd_offset(host_mm, host_addr);
> + guest_pgd = pgd_offset(guest->vm_mm, *addrp);
> + if (!pgd_same(*guest_pgd, *pgd)) {
> + set_pgd(guest_pgd, *pgd);
> + mmap_read_unlock(host_mm);
> + return VM_FAULT_NOPAGE;
> + }
> +
> + *addrp = host_addr;
> + vma = find_vma(host_mm, host_addr);
> +
> + /* XXX: expand stack? */
> + if (vma && vma->vm_start > host_addr)
> + vma = NULL;
> +
> + *vmap = vma;
> +
> + /*
> + * release host mm lock unless a matching vma is found
> + */
> + if (!vma)
> + mmap_read_unlock(host_mm);
> + return 0;
> +}
> +
> /*
> * Disallow partial unmaps of an mshare region for now. Unmapping at
> * boundaries aligned to the level page tables are shared at could
> --
> 2.43.5
>
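For illustration, here is roughly what the suggested hooks could look like. Note that struct vm_operations_struct has no free_pgtables or unmap_page_range callbacks in mainline; the signatures and the mshare_vm_ops wiring below are assumptions sketched around the existing free_pgtables()/unmap_page_range() code in mm/memory.c, not code from this series.

/*
 * Hypothetical callbacks (not in mainline); they would be new members of
 * struct vm_operations_struct:
 *
 *	void (*free_pgtables)(struct mmu_gather *tlb, struct vm_area_struct *vma,
 *			      unsigned long floor, unsigned long ceiling);
 *	void (*unmap_page_range)(struct mmu_gather *tlb, struct vm_area_struct *vma,
 *				 unsigned long addr, unsigned long end,
 *				 struct zap_details *details);
 */

/* mshare: the page tables belong to the host mm, so both callbacks are no-ops. */
static void mshare_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
				 unsigned long floor, unsigned long ceiling)
{
	/* Nothing to free; the shared tables are torn down with the host mm. */
}

static void mshare_unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
				    unsigned long addr, unsigned long end,
				    struct zap_details *details)
{
	/* PTEs live in the shared page tables; never unmap them via a guest mm. */
}

static const struct vm_operations_struct mshare_vm_ops = {
	.free_pgtables		= mshare_free_pgtables,
	.unmap_page_range	= mshare_unmap_page_range,
};

free_pgtables() and unmap_page_range() would then call the hook when one is set instead of special-casing vma_is_shared(), and hugetlb could supply its own callback wrapping hugetlb_free_pgd_range().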
On 10/7/24 1:41 AM, Kirill A. Shutemov wrote:
> On Tue, Sep 03, 2024 at 04:22:39PM -0700, Anthony Yznaga wrote:
>> From: Khalid Aziz <khalid@kernel.org>
>>
>> Add support for handling page faults in an mshare region by
>> redirecting the faults to operate on the mshare mm_struct and
>> vmas contained in it and to link the page tables of the faulting
>> process with the shared page tables in the mshare mm.
>> Modify the unmap path to ensure that page tables in mshare regions
>> are kept intact when a process exits. Note that they are also
>> kept intact and not unlinked from a process when an mshare region
>> is explicitly unmapped which is bug to be addressed.
>>
>> Signed-off-by: Khalid Aziz <khalid@kernel.org>
>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>> Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
>> ---
>> mm/internal.h | 1 +
>> mm/memory.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++---
>> mm/mshare.c | 38 +++++++++++++++++++++++++++++++
>> 3 files changed, 98 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 8005d5956b6e..8ac224d96806 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -1578,6 +1578,7 @@ void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
>> void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
>> void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
>>
>> +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, unsigned long *addrp);
>> static inline bool vma_is_shared(const struct vm_area_struct *vma)
>> {
>> return VM_SHARED_PT && (vma->vm_flags & VM_SHARED_PT);
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 3c01d68065be..f526aef71a61 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -387,11 +387,15 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>> vma_start_write(vma);
>> unlink_anon_vmas(vma);
>>
>> + /*
>> + * There is no page table to be freed for vmas that
>> + * are mapped in mshare regions
>> + */
>> if (is_vm_hugetlb_page(vma)) {
>> unlink_file_vma(vma);
>> hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
>> floor, next ? next->vm_start : ceiling);
>> - } else {
>> + } else if (!vma_is_shared(vma)) {
>> unlink_file_vma_batch_init(&vb);
>> unlink_file_vma_batch_add(&vb, vma);
>>
>> @@ -399,7 +403,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>> * Optimization: gather nearby vmas into one call down
>> */
>> while (next && next->vm_start <= vma->vm_end + PMD_SIZE
>> - && !is_vm_hugetlb_page(next)) {
>> + && !is_vm_hugetlb_page(next)
>> + && !vma_is_shared(next)) {
>> vma = next;
>> next = mas_find(mas, ceiling - 1);
>> if (unlikely(xa_is_zero(next)))
>> @@ -412,7 +417,9 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>> unlink_file_vma_batch_final(&vb);
>> free_pgd_range(tlb, addr, vma->vm_end,
>> floor, next ? next->vm_start : ceiling);
>> - }
>> + } else
>> + unlink_file_vma(vma);
>> +
>> vma = next;
> I would rather have vma->vm_ops->free_pgtables() hook that would be defined
> to non-NULL for mshared and hugetlb VMAs

That's a good idea. I'll do that.

>
>> } while (vma);
>> }
>> @@ -1797,6 +1804,13 @@ void unmap_page_range(struct mmu_gather *tlb,
>> pgd_t *pgd;
>> unsigned long next;
>>
>> + /*
>> + * No need to unmap vmas that share page table through
>> + * mshare region
>> + */
>> + if (vma_is_shared(vma))
>> + return;
>> +
> Ditto. vma->vm_ops->unmap_page_range().

Okay, I can do that here, too.

>
>> BUG_ON(addr >= end);
>> tlb_start_vma(tlb, vma);
>> pgd = pgd_offset(vma->vm_mm, addr);
>> @@ -5801,6 +5815,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>> struct mm_struct *mm = vma->vm_mm;
>> vm_fault_t ret;
>> bool is_droppable;
>> + bool shared = false;
>>
>> __set_current_state(TASK_RUNNING);
>>
>> @@ -5808,6 +5823,21 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>> if (ret)
>> goto out;
>>
>> + if (unlikely(vma_is_shared(vma))) {
>> + /* XXX mshare does not support per-VMA locks yet so fallback to mm lock */
>> + if (flags & FAULT_FLAG_VMA_LOCK) {
>> + vma_end_read(vma);
>> + return VM_FAULT_RETRY;
>> + }
>> +
>> + ret = find_shared_vma(&vma, &address);
>> + if (ret)
>> + return ret;
>> + if (!vma)
>> + return VM_FAULT_SIGSEGV;
>> + shared = true;
> Do we need to update 'mm' variable here?
>
> It is going to be used to account the fault below. Not sure which mm has
> to account such faults.

The accounting won't work right for memcg accounting, and there's a bug
here. The mshare mm is allocated via mm_alloc(), which will initialize
mm->owner to current. As long as that task is around,
count_memcg_event_mm() will go through it to get a memcg to account to.
But if the task has exited, the mshare mm owner is no longer valid but
will still be used. I will just clear the owner for now.

And a quick note on calling find_shared_vma() here: I found I needed to
move the call earlier, to do_user_addr_fault(), because there are
permission checks there that check vma flags, and they need to be done
against the vmas in the mshare mm.

>
>> + }
>> +
>> if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>> flags & FAULT_FLAG_INSTRUCTION,
>> flags & FAULT_FLAG_REMOTE)) {
>> @@ -5843,6 +5873,32 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>> if (is_droppable)
>> ret &= ~VM_FAULT_OOM;
>>
>> + /*
>> + * Release the read lock on the shared mm of a shared VMA unless
>> + * unless the lock has already been released.
>> + * The mmap lock will already have been released if __handle_mm_fault
>> + * returns VM_FAULT_COMPLETED or if it returns VM_FAULT_RETRY and
>> + * the flags FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT are
>> + * _not_ both set.
>> + * If the lock was released earlier, release the lock on the task's
>> + * mm now to keep lock state consistent.
>> + */
>> + if (shared) {
>> + int release_mmlock = 1;
>> +
>> + if ((ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) == 0) {
>> + mmap_read_unlock(vma->vm_mm);
>> + release_mmlock = 0;
>> + } else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
>> + (flags & FAULT_FLAG_RETRY_NOWAIT)) {
>> + mmap_read_unlock(vma->vm_mm);
>> + release_mmlock = 0;
>> + }
>> +
>> + if (release_mmlock)
>> + mmap_read_unlock(mm);
>> + }
>> +
>> if (flags & FAULT_FLAG_USER) {
>> mem_cgroup_exit_user_fault();
>> /*
>> diff --git a/mm/mshare.c b/mm/mshare.c
>> index f3f6ed9c3761..8f47c8d6e6a4 100644
>> --- a/mm/mshare.c
>> +++ b/mm/mshare.c
>> @@ -19,6 +19,7 @@
>> #include <linux/spinlock_types.h>
>> #include <uapi/linux/magic.h>
>> #include <uapi/linux/msharefs.h>
>> +#include "internal.h"
>>
>> struct mshare_data {
>> struct mm_struct *mm;
>> @@ -33,6 +34,43 @@ struct msharefs_info {
>> static const struct inode_operations msharefs_dir_inode_ops;
>> static const struct inode_operations msharefs_file_inode_ops;
>>
>> +/* Returns holding the host mm's lock for read. Caller must release. */
>> +vm_fault_t
>> +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
>> +{
>> + struct vm_area_struct *vma, *guest = *vmap;
>> + struct mshare_data *m_data = guest->vm_private_data;
>> + struct mm_struct *host_mm = m_data->mm;
>> + unsigned long host_addr;
>> + pgd_t *pgd, *guest_pgd;
>> +
>> + mmap_read_lock(host_mm);
> Hm. So we have current->mm locked here, right? So this is nested mmap
> lock. Have you tested it under lockdep? I expected it to complain.

Yes, it complains. I have patches to introduce and use
mmap_read_lock_nested().

Thank you for the feedback.

Anthony

>
>> + host_addr = *addrp - guest->vm_start + host_mm->mmap_base;
>> + pgd = pgd_offset(host_mm, host_addr);
>> + guest_pgd = pgd_offset(guest->vm_mm, *addrp);
>> + if (!pgd_same(*guest_pgd, *pgd)) {
>> + set_pgd(guest_pgd, *pgd);
>> + mmap_read_unlock(host_mm);
>> + return VM_FAULT_NOPAGE;
>> + }
>> +
>> + *addrp = host_addr;
>> + vma = find_vma(host_mm, host_addr);
>> +
>> + /* XXX: expand stack? */
>> + if (vma && vma->vm_start > host_addr)
>> + vma = NULL;
>> +
>> + *vmap = vma;
>> +
>> + /*
>> + * release host mm lock unless a matching vma is found
>> + */
>> + if (!vma)
>> + mmap_read_unlock(host_mm);
>> + return 0;
>> +}
>> +
>> /*
>> * Disallow partial unmaps of an mshare region for now. Unmapping at
>> * boundaries aligned to the level page tables are shared at could
>> --
>> 2.43.5
>>
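A minimal sketch of the nested-lock annotation being discussed, assuming an mmap_read_lock_nested() along the lines Anthony mentions; it is not an existing mainline API, and the wrapper name and its use of the stock down_read_nested()/SINGLE_DEPTH_NESTING lockdep machinery are assumptions here:

/* Hypothetical wrapper, e.g. in include/linux/mmap_lock.h (not mainline). */
static inline void mmap_read_lock_nested(struct mm_struct *mm, int subclass)
{
	down_read_nested(&mm->mmap_lock, subclass);
}

/*
 * find_shared_vma() would then take the host mm's lock as a nested
 * acquisition, since the faulting task's mm is already read-locked:
 *
 *	mmap_read_lock_nested(host_mm, SINGLE_DEPTH_NESTING);
 */

The separate memcg point comes down to not leaving the mshare mm's mm->owner pointing at whichever task happened to create the region; Anthony's interim plan above is simply to clear it.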
diff --git a/mm/internal.h b/mm/internal.h
index 8005d5956b6e..8ac224d96806 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1578,6 +1578,7 @@ void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);

+extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, unsigned long *addrp);
static inline bool vma_is_shared(const struct vm_area_struct *vma)
{
return VM_SHARED_PT && (vma->vm_flags & VM_SHARED_PT);
diff --git a/mm/memory.c b/mm/memory.c
index 3c01d68065be..f526aef71a61 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -387,11 +387,15 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
vma_start_write(vma);
unlink_anon_vmas(vma);

+ /*
+ * There is no page table to be freed for vmas that
+ * are mapped in mshare regions
+ */
if (is_vm_hugetlb_page(vma)) {
unlink_file_vma(vma);
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next ? next->vm_start : ceiling);
- } else {
+ } else if (!vma_is_shared(vma)) {
unlink_file_vma_batch_init(&vb);
unlink_file_vma_batch_add(&vb, vma);

@@ -399,7 +403,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
* Optimization: gather nearby vmas into one call down
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
+ && !is_vm_hugetlb_page(next)
+ && !vma_is_shared(next)) {
vma = next;
next = mas_find(mas, ceiling - 1);
if (unlikely(xa_is_zero(next)))
@@ -412,7 +417,9 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
unlink_file_vma_batch_final(&vb);
free_pgd_range(tlb, addr, vma->vm_end,
floor, next ? next->vm_start : ceiling);
- }
+ } else
+ unlink_file_vma(vma);
+
vma = next;
} while (vma);
}
@@ -1797,6 +1804,13 @@ void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;

+ /*
+ * No need to unmap vmas that share page table through
+ * mshare region
+ */
+ if (vma_is_shared(vma))
+ return;
+
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -5801,6 +5815,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
struct mm_struct *mm = vma->vm_mm;
vm_fault_t ret;
bool is_droppable;
+ bool shared = false;

__set_current_state(TASK_RUNNING);

@@ -5808,6 +5823,21 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (ret)
goto out;

+ if (unlikely(vma_is_shared(vma))) {
+ /* XXX mshare does not support per-VMA locks yet so fallback to mm lock */
+ if (flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+
+ ret = find_shared_vma(&vma, &address);
+ if (ret)
+ return ret;
+ if (!vma)
+ return VM_FAULT_SIGSEGV;
+ shared = true;
+ }
+
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE)) {
@@ -5843,6 +5873,32 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (is_droppable)
ret &= ~VM_FAULT_OOM;

+ /*
+ * Release the read lock on the shared mm of a shared VMA unless
+ * unless the lock has already been released.
+ * The mmap lock will already have been released if __handle_mm_fault
+ * returns VM_FAULT_COMPLETED or if it returns VM_FAULT_RETRY and
+ * the flags FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT are
+ * _not_ both set.
+ * If the lock was released earlier, release the lock on the task's
+ * mm now to keep lock state consistent.
+ */
+ if (shared) {
+ int release_mmlock = 1;
+
+ if ((ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) == 0) {
+ mmap_read_unlock(vma->vm_mm);
+ release_mmlock = 0;
+ } else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
+ (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ mmap_read_unlock(vma->vm_mm);
+ release_mmlock = 0;
+ }
+
+ if (release_mmlock)
+ mmap_read_unlock(mm);
+ }
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
diff --git a/mm/mshare.c b/mm/mshare.c
index f3f6ed9c3761..8f47c8d6e6a4 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -19,6 +19,7 @@
#include <linux/spinlock_types.h>
#include <uapi/linux/magic.h>
#include <uapi/linux/msharefs.h>
+#include "internal.h"

struct mshare_data {
struct mm_struct *mm;
@@ -33,6 +34,43 @@ struct msharefs_info {
static const struct inode_operations msharefs_dir_inode_ops;
static const struct inode_operations msharefs_file_inode_ops;

+/* Returns holding the host mm's lock for read. Caller must release. */
+vm_fault_t
+find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
+{
+ struct vm_area_struct *vma, *guest = *vmap;
+ struct mshare_data *m_data = guest->vm_private_data;
+ struct mm_struct *host_mm = m_data->mm;
+ unsigned long host_addr;
+ pgd_t *pgd, *guest_pgd;
+
+ mmap_read_lock(host_mm);
+ host_addr = *addrp - guest->vm_start + host_mm->mmap_base;
+ pgd = pgd_offset(host_mm, host_addr);
+ guest_pgd = pgd_offset(guest->vm_mm, *addrp);
+ if (!pgd_same(*guest_pgd, *pgd)) {
+ set_pgd(guest_pgd, *pgd);
+ mmap_read_unlock(host_mm);
+ return VM_FAULT_NOPAGE;
+ }
+
+ *addrp = host_addr;
+ vma = find_vma(host_mm, host_addr);
+
+ /* XXX: expand stack? */
+ if (vma && vma->vm_start > host_addr)
+ vma = NULL;
+
+ *vmap = vma;
+
+ /*
+ * release host mm lock unless a matching vma is found
+ */
+ if (!vma)
+ mmap_read_unlock(host_mm);
+ return 0;
+}
+
/*
* Disallow partial unmaps of an mshare region for now. Unmapping at
* boundaries aligned to the level page tables are shared at could
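One way to read the release_mmlock logic in the handle_mm_fault() hunk above is as a single question, "did __handle_mm_fault() already drop the mmap lock?", applied to two locks. The helper below restates that predicate following the patch's own comment; the function name and structure are invented here for illustration and are not part of the series:

/*
 * True if __handle_mm_fault() has already dropped the mmap read lock it was
 * called with: the fault completed, or it returned VM_FAULT_RETRY without
 * both FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT being set.
 */
static bool fault_dropped_mmap_lock(vm_fault_t ret, unsigned int flags)
{
	if (ret & VM_FAULT_COMPLETED)
		return true;
	if (!(ret & VM_FAULT_RETRY))
		return false;
	return !((flags & FAULT_FLAG_ALLOW_RETRY) &&
		 (flags & FAULT_FLAG_RETRY_NOWAIT));
}

/*
 * For a shared VMA the fault ran against the mshare host mm (vma->vm_mm after
 * find_shared_vma() switched the vma), while 'mm' is still the faulting
 * task's mm, so the cleanup above amounts to:
 *
 *	if (shared) {
 *		if (fault_dropped_mmap_lock(ret, flags))
 *			mmap_read_unlock(mm);		// host lock already gone
 *		else
 *			mmap_read_unlock(vma->vm_mm);	// drop only the host lock
 *	}
 */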