Message ID | 20190121075722.7945-7-peterx@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | userfaultfd: write protection support | expand |
On Mon, Jan 21, 2019 at 03:57:04PM +0800, Peter Xu wrote: > From: Shaohua Li <shli@fb.com> > > Add API to enable/disable writeprotect a vma range. Unlike mprotect, > this doesn't split/merge vmas. > > Cc: Andrea Arcangeli <aarcange@redhat.com> > Cc: Pavel Emelyanov <xemul@parallels.com> > Cc: Rik van Riel <riel@redhat.com> > Cc: Kirill A. Shutemov <kirill@shutemov.name> > Cc: Mel Gorman <mgorman@suse.de> > Cc: Hugh Dickins <hughd@google.com> > Cc: Johannes Weiner <hannes@cmpxchg.org> > Signed-off-by: Shaohua Li <shli@fb.com> > Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > include/linux/userfaultfd_k.h | 2 ++ > mm/userfaultfd.c | 52 +++++++++++++++++++++++++++++++++++ > 2 files changed, 54 insertions(+) > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > index 38f748e7186e..e82f3156f4e9 100644 > --- a/include/linux/userfaultfd_k.h > +++ b/include/linux/userfaultfd_k.h > @@ -37,6 +37,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, > unsigned long dst_start, > unsigned long len, > bool *mmap_changing); > +extern int mwriteprotect_range(struct mm_struct *dst_mm, > + unsigned long start, unsigned long len, bool enable_wp); > > /* mm helpers */ > static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 458acda96f20..c38903f501c7 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -615,3 +615,55 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, > { > return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); > } > + > +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, > + unsigned long len, bool enable_wp) > +{ > + struct vm_area_struct *dst_vma; > + pgprot_t newprot; > + int err; > + > + /* > + * Sanitize the command parameters: > + */ > + BUG_ON(start & ~PAGE_MASK); > + BUG_ON(len & ~PAGE_MASK); > + > + /* Does the address 
range wrap, or is the span zero-sized? */ > + BUG_ON(start + len <= start); > + > + down_read(&dst_mm->mmap_sem); > + > + /* > + * Make sure the vma is not shared, that the dst range is > + * both valid and fully within a single existing vma. > + */ > + err = -EINVAL; In non-cooperative mode, there can be a race between VM layout changes and mcopy_atomic [1]. I believe the same races are possible here, so can we please make err = -ENOENT for consistency with mcopy? > + dst_vma = find_vma(dst_mm, start); > + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) > + goto out_unlock; > + if (start < dst_vma->vm_start || > + start + len > dst_vma->vm_end) > + goto out_unlock; > + > + if (!dst_vma->vm_userfaultfd_ctx.ctx) > + goto out_unlock; > + if (!userfaultfd_wp(dst_vma)) > + goto out_unlock; > + > + if (!vma_is_anonymous(dst_vma)) > + goto out_unlock; The sanity checks here seem to repeat those in mcopy_atomic(). I'd suggest splitting them out to a helper function. > + if (enable_wp) > + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); > + else > + newprot = vm_get_page_prot(dst_vma->vm_flags); > + > + change_protection(dst_vma, start, start + len, newprot, > + !enable_wp, 0); > + > + err = 0; > +out_unlock: > + up_read(&dst_mm->mmap_sem); > + return err; > +} > -- > 2.17.1 [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=27d02568f529e908399514dfbee8ee43bdfd5299
On Mon, Jan 21, 2019 at 03:57:04PM +0800, Peter Xu wrote: > From: Shaohua Li <shli@fb.com> > > Add API to enable/disable writeprotect a vma range. Unlike mprotect, > this doesn't split/merge vmas. AFAICT it does not do that. > > Cc: Andrea Arcangeli <aarcange@redhat.com> > Cc: Pavel Emelyanov <xemul@parallels.com> > Cc: Rik van Riel <riel@redhat.com> > Cc: Kirill A. Shutemov <kirill@shutemov.name> > Cc: Mel Gorman <mgorman@suse.de> > Cc: Hugh Dickins <hughd@google.com> > Cc: Johannes Weiner <hannes@cmpxchg.org> > Signed-off-by: Shaohua Li <shli@fb.com> > Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > include/linux/userfaultfd_k.h | 2 ++ > mm/userfaultfd.c | 52 +++++++++++++++++++++++++++++++++++ > 2 files changed, 54 insertions(+) > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > index 38f748e7186e..e82f3156f4e9 100644 > --- a/include/linux/userfaultfd_k.h > +++ b/include/linux/userfaultfd_k.h > @@ -37,6 +37,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, > unsigned long dst_start, > unsigned long len, > bool *mmap_changing); > +extern int mwriteprotect_range(struct mm_struct *dst_mm, > + unsigned long start, unsigned long len, bool enable_wp); > > /* mm helpers */ > static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > index 458acda96f20..c38903f501c7 100644 > --- a/mm/userfaultfd.c > +++ b/mm/userfaultfd.c > @@ -615,3 +615,55 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, > { > return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); > } > + > +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, > + unsigned long len, bool enable_wp) > +{ > + struct vm_area_struct *dst_vma; > + pgprot_t newprot; > + int err; > + > + /* > + * Sanitize the command parameters: > + */ > + BUG_ON(start & ~PAGE_MASK); > + BUG_ON(len & ~PAGE_MASK); > + 
> + /* Does the address range wrap, or is the span zero-sized? */ > + BUG_ON(start + len <= start); > + > + down_read(&dst_mm->mmap_sem); > + > + /* > + * Make sure the vma is not shared, that the dst range is > + * both valid and fully within a single existing vma. > + */ > + err = -EINVAL; > + dst_vma = find_vma(dst_mm, start); > + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) > + goto out_unlock; > + if (start < dst_vma->vm_start || > + start + len > dst_vma->vm_end) > + goto out_unlock; > + > + if (!dst_vma->vm_userfaultfd_ctx.ctx) > + goto out_unlock; > + if (!userfaultfd_wp(dst_vma)) > + goto out_unlock; > + > + if (!vma_is_anonymous(dst_vma)) > + goto out_unlock; > + > + if (enable_wp) > + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); > + else > + newprot = vm_get_page_prot(dst_vma->vm_flags); > + > + change_protection(dst_vma, start, start + len, newprot, > + !enable_wp, 0); So setting dirty_accountable bring us to that code in mprotect.c: if (dirty_accountable && pte_dirty(ptent) && (pte_soft_dirty(ptent) || !(vma->vm_flags & VM_SOFTDIRTY))) { ptent = pte_mkwrite(ptent); } My understanding is that you want to set write flag when enable_wp is false and you want to set the write flag unconditionaly, right ? If so then you should really move the change_protection() flags patch before this patch and add a flag for setting pte write flags. Otherwise the above is broken at it will only set the write flag for pte that were dirty and i am guessing so far you always were lucky because pte were all dirty (change_protection will preserve dirtyness) when you write protected them. So i believe the above is broken or at very least unclear if what you really want is to only set write flag to pte that have the dirty flag set. Cheers, Jérôme > + > + err = 0; > +out_unlock: > + up_read(&dst_mm->mmap_sem); > + return err; > +} > -- > 2.17.1 >
On Mon, Jan 21, 2019 at 12:20:35PM +0200, Mike Rapoport wrote: > On Mon, Jan 21, 2019 at 03:57:04PM +0800, Peter Xu wrote: > > From: Shaohua Li <shli@fb.com> > > > > Add API to enable/disable writeprotect a vma range. Unlike mprotect, > > this doesn't split/merge vmas. > > > > Cc: Andrea Arcangeli <aarcange@redhat.com> > > Cc: Pavel Emelyanov <xemul@parallels.com> > > Cc: Rik van Riel <riel@redhat.com> > > Cc: Kirill A. Shutemov <kirill@shutemov.name> > > Cc: Mel Gorman <mgorman@suse.de> > > Cc: Hugh Dickins <hughd@google.com> > > Cc: Johannes Weiner <hannes@cmpxchg.org> > > Signed-off-by: Shaohua Li <shli@fb.com> > > Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> > > Signed-off-by: Peter Xu <peterx@redhat.com> > > --- > > include/linux/userfaultfd_k.h | 2 ++ > > mm/userfaultfd.c | 52 +++++++++++++++++++++++++++++++++++ > > 2 files changed, 54 insertions(+) > > > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > > index 38f748e7186e..e82f3156f4e9 100644 > > --- a/include/linux/userfaultfd_k.h > > +++ b/include/linux/userfaultfd_k.h > > @@ -37,6 +37,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, > > unsigned long dst_start, > > unsigned long len, > > bool *mmap_changing); > > +extern int mwriteprotect_range(struct mm_struct *dst_mm, > > + unsigned long start, unsigned long len, bool enable_wp); > > > > /* mm helpers */ > > static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > > index 458acda96f20..c38903f501c7 100644 > > --- a/mm/userfaultfd.c > > +++ b/mm/userfaultfd.c > > @@ -615,3 +615,55 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, > > { > > return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); > > } > > + > > +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, > > + unsigned long len, bool enable_wp) > > +{ > > + struct vm_area_struct *dst_vma; > > + pgprot_t newprot; > > + 
int err; > > + > > + /* > > + * Sanitize the command parameters: > > + */ > > + BUG_ON(start & ~PAGE_MASK); > > + BUG_ON(len & ~PAGE_MASK); > > + > > + /* Does the address range wrap, or is the span zero-sized? */ > > + BUG_ON(start + len <= start); > > + > > + down_read(&dst_mm->mmap_sem); > > + > > + /* > > + * Make sure the vma is not shared, that the dst range is > > + * both valid and fully within a single existing vma. > > + */ > > + err = -EINVAL; > > In non-cooperative mode, there can be a race between VM layout changes and > mcopy_atomic [1]. I believe the same races are possible here, so can we > please make err = -ENOENT for consistency with mcopy? Sure. > > > + dst_vma = find_vma(dst_mm, start); > > + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) > > + goto out_unlock; > > + if (start < dst_vma->vm_start || > > + start + len > dst_vma->vm_end) > > + goto out_unlock; > > + > > + if (!dst_vma->vm_userfaultfd_ctx.ctx) > > + goto out_unlock; > > + if (!userfaultfd_wp(dst_vma)) > > + goto out_unlock; > > + > > + if (!vma_is_anonymous(dst_vma)) > > + goto out_unlock; > > The sanity checks here seem to repeat those in mcopy_atomic(). I'd suggest > splitting them out to a helper function. It's a good suggestion. Thanks! > > > + if (enable_wp) > > + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); > > + else > > + newprot = vm_get_page_prot(dst_vma->vm_flags); > > + > > + change_protection(dst_vma, start, start + len, newprot, > > + !enable_wp, 0); > > + > > + err = 0; > > +out_unlock: > > + up_read(&dst_mm->mmap_sem); > > + return err; > > +} > > -- > > 2.17.1 > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=27d02568f529e908399514dfbee8ee43bdfd5299 > > -- > Sincerely yours, > Mike. >
On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote: [...] > > + change_protection(dst_vma, start, start + len, newprot, > > + !enable_wp, 0); > > So setting dirty_accountable bring us to that code in mprotect.c: > > if (dirty_accountable && pte_dirty(ptent) && > (pte_soft_dirty(ptent) || > !(vma->vm_flags & VM_SOFTDIRTY))) { > ptent = pte_mkwrite(ptent); > } > > My understanding is that you want to set write flag when enable_wp > is false and you want to set the write flag unconditionaly, right ? Right. > > If so then you should really move the change_protection() flags > patch before this patch and add a flag for setting pte write flags. > > Otherwise the above is broken at it will only set the write flag > for pte that were dirty and i am guessing so far you always were > lucky because pte were all dirty (change_protection will preserve > dirtyness) when you write protected them. > > So i believe the above is broken or at very least unclear if what > you really want is to only set write flag to pte that have the > dirty flag set. You are right, if we build the tree until this patch it won't work for all the cases. It'll only work if the page was at least writable before and also it's dirty (as you explained). Sorry to be unclear about this, maybe I should at least mention that in the commit message but I totally forgot it. All these problems are solved in later on patches, please feel free to have a look at: mm: merge parameters for change_protection() userfaultfd: wp: apply _PAGE_UFFD_WP bit userfaultfd: wp: handle COW properly for uffd-wp Note that even in the follow up patches IMHO we can't directly change the write permission since the page can be shared by other processes (e.g., the zero page or COW pages). But the general idea is the same as you explained. I tried to avoid squashing these stuff altogether as explained previously. 
Also, this patch can be seen as a standalone patch to introduce the new interface which seems to make sense too, and it is indeed still working in many cases so I see the latter patches as an enhancement of this one. Please let me know if you still want me to have all of this squashed, or if you'd like me to squash some of them. Thanks!
On Tue, Jan 22, 2019 at 05:39:35PM +0800, Peter Xu wrote: > On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote: > > [...] > > > > + change_protection(dst_vma, start, start + len, newprot, > > > + !enable_wp, 0); > > > > So setting dirty_accountable bring us to that code in mprotect.c: > > > > if (dirty_accountable && pte_dirty(ptent) && > > (pte_soft_dirty(ptent) || > > !(vma->vm_flags & VM_SOFTDIRTY))) { > > ptent = pte_mkwrite(ptent); > > } > > > > My understanding is that you want to set write flag when enable_wp > > is false and you want to set the write flag unconditionaly, right ? > > Right. > > > > > If so then you should really move the change_protection() flags > > patch before this patch and add a flag for setting pte write flags. > > > > Otherwise the above is broken at it will only set the write flag > > for pte that were dirty and i am guessing so far you always were > > lucky because pte were all dirty (change_protection will preserve > > dirtyness) when you write protected them. > > > > So i believe the above is broken or at very least unclear if what > > you really want is to only set write flag to pte that have the > > dirty flag set. > > You are right, if we build the tree until this patch it won't work for > all the cases. It'll only work if the page was at least writable > before and also it's dirty (as you explained). Sorry to be unclear > about this, maybe I should at least mention that in the commit message > but I totally forgot it. > > All these problems are solved in later on patches, please feel free to > have a look at: > > mm: merge parameters for change_protection() > userfaultfd: wp: apply _PAGE_UFFD_WP bit > userfaultfd: wp: handle COW properly for uffd-wp > > Note that even in the follow up patches IMHO we can't directly change > the write permission since the page can be shared by other processes > (e.g., the zero page or COW pages). But the general idea is the same > as you explained. 
> > I tried to avoid squashing these stuff altogether as explained > previously. Also, this patch can be seen as a standalone patch to > introduce the new interface which seems to make sense too, and it is > indeed still working in many cases so I see the latter patches as > enhancement of this one. Please let me know if you still want me to > have all these stuff squashed, or if you'd like me to squash some of > them. Yeah i have look at those after looking at this one. You should just re-order the patch this one first and then one that add new flag, then ones that add the new userfaultfd feature. Otherwise you are adding a userfaultfd feature that is broken midway ie it is added broken and then you fix it. Some one bisecting thing might get hurt by that. It is better to add and change everything you need and then add the new feature so that the new feature will work as intended. So no squashing just change the order ie add the userfaultfd code last. Cheers, Jérôme
On Tue, Jan 22, 2019 at 12:02:24PM -0500, Jerome Glisse wrote: > On Tue, Jan 22, 2019 at 05:39:35PM +0800, Peter Xu wrote: > > On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote: > > > > [...] > > > > > > + change_protection(dst_vma, start, start + len, newprot, > > > > + !enable_wp, 0); > > > > > > So setting dirty_accountable bring us to that code in mprotect.c: > > > > > > if (dirty_accountable && pte_dirty(ptent) && > > > (pte_soft_dirty(ptent) || > > > !(vma->vm_flags & VM_SOFTDIRTY))) { > > > ptent = pte_mkwrite(ptent); > > > } > > > > > > My understanding is that you want to set write flag when enable_wp > > > is false and you want to set the write flag unconditionaly, right ? > > > > Right. > > > > > > > > If so then you should really move the change_protection() flags > > > patch before this patch and add a flag for setting pte write flags. > > > > > > Otherwise the above is broken at it will only set the write flag > > > for pte that were dirty and i am guessing so far you always were > > > lucky because pte were all dirty (change_protection will preserve > > > dirtyness) when you write protected them. > > > > > > So i believe the above is broken or at very least unclear if what > > > you really want is to only set write flag to pte that have the > > > dirty flag set. > > > > You are right, if we build the tree until this patch it won't work for > > all the cases. It'll only work if the page was at least writable > > before and also it's dirty (as you explained). Sorry to be unclear > > about this, maybe I should at least mention that in the commit message > > but I totally forgot it. 
> > > > All these problems are solved in later on patches, please feel free to > > have a look at: > > > > mm: merge parameters for change_protection() > > userfaultfd: wp: apply _PAGE_UFFD_WP bit > > userfaultfd: wp: handle COW properly for uffd-wp > > > > Note that even in the follow up patches IMHO we can't directly change > > the write permission since the page can be shared by other processes > > (e.g., the zero page or COW pages). But the general idea is the same > > as you explained. > > > > I tried to avoid squashing these stuff altogether as explained > > previously. Also, this patch can be seen as a standalone patch to > > introduce the new interface which seems to make sense too, and it is > > indeed still working in many cases so I see the latter patches as > > enhancement of this one. Please let me know if you still want me to > > have all these stuff squashed, or if you'd like me to squash some of > > them. > > Yeah i have look at those after looking at this one. You should just > re-order the patch this one first and then one that add new flag, > then ones that add the new userfaultfd feature. Otherwise you are > adding a userfaultfd feature that is broken midway ie it is added > broken and then you fix it. Some one bisecting thing might get hurt > by that. It is better to add and change everything you need and then > add the new feature so that the new feature will work as intended. > > So no squashing just change the order ie add the userfaultfd code > last. Yes this makes sense, I'll do that in v2. Thanks for the suggestion!
On Wed, Jan 23, 2019 at 10:17:45AM +0800, Peter Xu wrote: > On Tue, Jan 22, 2019 at 12:02:24PM -0500, Jerome Glisse wrote: > > On Tue, Jan 22, 2019 at 05:39:35PM +0800, Peter Xu wrote: > > > On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote: > > > > > > [...] > > > > > > > > + change_protection(dst_vma, start, start + len, newprot, > > > > > + !enable_wp, 0); > > > > > > > > So setting dirty_accountable bring us to that code in mprotect.c: > > > > > > > > if (dirty_accountable && pte_dirty(ptent) && > > > > (pte_soft_dirty(ptent) || > > > > !(vma->vm_flags & VM_SOFTDIRTY))) { > > > > ptent = pte_mkwrite(ptent); > > > > } > > > > > > > > My understanding is that you want to set write flag when enable_wp > > > > is false and you want to set the write flag unconditionaly, right ? > > > > > > Right. > > > > > > > > > > > If so then you should really move the change_protection() flags > > > > patch before this patch and add a flag for setting pte write flags. > > > > > > > > Otherwise the above is broken at it will only set the write flag > > > > for pte that were dirty and i am guessing so far you always were > > > > lucky because pte were all dirty (change_protection will preserve > > > > dirtyness) when you write protected them. > > > > > > > > So i believe the above is broken or at very least unclear if what > > > > you really want is to only set write flag to pte that have the > > > > dirty flag set. > > > > > > You are right, if we build the tree until this patch it won't work for > > > all the cases. It'll only work if the page was at least writable > > > before and also it's dirty (as you explained). Sorry to be unclear > > > about this, maybe I should at least mention that in the commit message > > > but I totally forgot it. 
> > > > > > All these problems are solved in later on patches, please feel free to > > > have a look at: > > > > > > mm: merge parameters for change_protection() > > > userfaultfd: wp: apply _PAGE_UFFD_WP bit > > > userfaultfd: wp: handle COW properly for uffd-wp > > > > > > Note that even in the follow up patches IMHO we can't directly change > > > the write permission since the page can be shared by other processes > > > (e.g., the zero page or COW pages). But the general idea is the same > > > as you explained. > > > > > > I tried to avoid squashing these stuff altogether as explained > > > previously. Also, this patch can be seen as a standalone patch to > > > introduce the new interface which seems to make sense too, and it is > > > indeed still working in many cases so I see the latter patches as > > > enhancement of this one. Please let me know if you still want me to > > > have all these stuff squashed, or if you'd like me to squash some of > > > them. > > > > Yeah i have look at those after looking at this one. You should just > > re-order the patch this one first and then one that add new flag, > > then ones that add the new userfaultfd feature. Otherwise you are > > adding a userfaultfd feature that is broken midway ie it is added > > broken and then you fix it. Some one bisecting thing might get hurt > > by that. It is better to add and change everything you need and then > > add the new feature so that the new feature will work as intended. > > > > So no squashing just change the order ie add the userfaultfd code > > last. > > Yes this makes sense, I'll do that in v2. Thanks for the suggestion! Note before doing a v2 i would really like to see some proof of why you need new page table flag see my reply to: userfaultfd: wp: add WP pagetable tracking to x86 As i believe you can identify COW or KSM from UFD write protect with- out a pte flag. Cheers, Jérôme
On Tue, Jan 22, 2019 at 09:43:38PM -0500, Jerome Glisse wrote: > On Wed, Jan 23, 2019 at 10:17:45AM +0800, Peter Xu wrote: > > On Tue, Jan 22, 2019 at 12:02:24PM -0500, Jerome Glisse wrote: > > > On Tue, Jan 22, 2019 at 05:39:35PM +0800, Peter Xu wrote: > > > > On Mon, Jan 21, 2019 at 09:05:35AM -0500, Jerome Glisse wrote: > > > > > > > > [...] > > > > > > > > > > + change_protection(dst_vma, start, start + len, newprot, > > > > > > + !enable_wp, 0); > > > > > > > > > > So setting dirty_accountable bring us to that code in mprotect.c: > > > > > > > > > > if (dirty_accountable && pte_dirty(ptent) && > > > > > (pte_soft_dirty(ptent) || > > > > > !(vma->vm_flags & VM_SOFTDIRTY))) { > > > > > ptent = pte_mkwrite(ptent); > > > > > } > > > > > > > > > > My understanding is that you want to set write flag when enable_wp > > > > > is false and you want to set the write flag unconditionaly, right ? > > > > > > > > Right. > > > > > > > > > > > > > > If so then you should really move the change_protection() flags > > > > > patch before this patch and add a flag for setting pte write flags. > > > > > > > > > > Otherwise the above is broken at it will only set the write flag > > > > > for pte that were dirty and i am guessing so far you always were > > > > > lucky because pte were all dirty (change_protection will preserve > > > > > dirtyness) when you write protected them. > > > > > > > > > > So i believe the above is broken or at very least unclear if what > > > > > you really want is to only set write flag to pte that have the > > > > > dirty flag set. > > > > > > > > You are right, if we build the tree until this patch it won't work for > > > > all the cases. It'll only work if the page was at least writable > > > > before and also it's dirty (as you explained). Sorry to be unclear > > > > about this, maybe I should at least mention that in the commit message > > > > but I totally forgot it. 
> > > > > > > > All these problems are solved in later on patches, please feel free to > > > > have a look at: > > > > > > > > mm: merge parameters for change_protection() > > > > userfaultfd: wp: apply _PAGE_UFFD_WP bit > > > > userfaultfd: wp: handle COW properly for uffd-wp > > > > > > > > Note that even in the follow up patches IMHO we can't directly change > > > > the write permission since the page can be shared by other processes > > > > (e.g., the zero page or COW pages). But the general idea is the same > > > > as you explained. > > > > > > > > I tried to avoid squashing these stuff altogether as explained > > > > previously. Also, this patch can be seen as a standalone patch to > > > > introduce the new interface which seems to make sense too, and it is > > > > indeed still working in many cases so I see the latter patches as > > > > enhancement of this one. Please let me know if you still want me to > > > > have all these stuff squashed, or if you'd like me to squash some of > > > > them. > > > > > > Yeah i have look at those after looking at this one. You should just > > > re-order the patch this one first and then one that add new flag, > > > then ones that add the new userfaultfd feature. Otherwise you are > > > adding a userfaultfd feature that is broken midway ie it is added > > > broken and then you fix it. Some one bisecting thing might get hurt > > > by that. It is better to add and change everything you need and then > > > add the new feature so that the new feature will work as intended. > > > > > > So no squashing just change the order ie add the userfaultfd code > > > last. > > > > Yes this makes sense, I'll do that in v2. Thanks for the suggestion! > > Note before doing a v2 i would really like to see some proof of why > you need new page table flag see my reply to: > userfaultfd: wp: add WP pagetable tracking to x86 > > As i believe you can identify COW or KSM from UFD write protect with- > out a pte flag. Yes. 
I replied in that thread with my understanding on why the new bit is required in the PTE (and also another new bit in the swap entry). We can discuss there. Thanks,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 38f748e7186e..e82f3156f4e9 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -37,6 +37,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool *mmap_changing); +extern int mwriteprotect_range(struct mm_struct *dst_mm, + unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 458acda96f20..c38903f501c7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -615,3 +615,55 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, { return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); } + +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, bool enable_wp) +{ + struct vm_area_struct *dst_vma; + pgprot_t newprot; + int err; + + /* + * Sanitize the command parameters: + */ + BUG_ON(start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(start + len <= start); + + down_read(&dst_mm->mmap_sem); + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. 
+ */ + err = -EINVAL; + dst_vma = find_vma(dst_mm, start); + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (start < dst_vma->vm_start || + start + len > dst_vma->vm_end) + goto out_unlock; + + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + if (!userfaultfd_wp(dst_vma)) + goto out_unlock; + + if (!vma_is_anonymous(dst_vma)) + goto out_unlock; + + if (enable_wp) + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + else + newprot = vm_get_page_prot(dst_vma->vm_flags); + + change_protection(dst_vma, start, start + len, newprot, + !enable_wp, 0); + + err = 0; +out_unlock: + up_read(&dst_mm->mmap_sem); + return err; +}