--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5287,6 +5287,7 @@ T:	git git://people.freedesktop.org/~thomash/linux
S: Supported
F: drivers/gpu/drm/vmwgfx/
F: include/uapi/drm/vmwgfx_drm.h
+F:	mm/as_dirty_helpers.c

DRM DRIVERS
M: David Airlie <airlied@linux.ie>
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2657,7 +2657,6 @@ typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
-
#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
extern void kernel_poison_pages(struct page *page, int numpages, int enable);
@@ -2891,5 +2890,17 @@ void __init setup_nr_node_ids(void);
static inline void setup_nr_node_ids(void) {}
#endif

+#ifdef CONFIG_AS_DIRTY_HELPERS
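+/*
+ * Helpers for write-protecting or cleaning all ptes pointing into a
+ * range of an address_space; implemented in mm/as_dirty_helpers.c.
+ */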
+unsigned long apply_as_clean(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr,
+ pgoff_t bitmap_pgoff,
+ unsigned long *bitmap,
+ pgoff_t *start,
+ pgoff_t *end);
+
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr);
+#endif
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -736,4 +736,7 @@ config ARCH_HAS_PTE_SPECIAL
config ARCH_HAS_HUGEPD
bool

+config AS_DIRTY_HELPERS
+ bool
+
endmenu
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -105,3 +105,4 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o
new file mode 100644
--- /dev/null
+++ b/mm/as_dirty_helpers.c
@@ -0,0 +1,392 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/hugetlb.h>
+#include <linux/bitops.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct as_walk - Argument to struct as_walk_ops callbacks.
+ * @vma: Pointer to the struct vm_area_struct currently being walked.
+ *
+ * Embeddable argument to struct as_walk_ops callbacks.
+ */
+struct as_walk {
+ struct vm_area_struct *vma;
+};
+
+/**
+ * struct as_walk_ops - Callbacks for entries of various page table levels.
+ * Extend for additional page table level support.
+ */
+struct as_walk_ops {
+ /**
+	 * @pte_entry: Callback for PTEs.
+ * @pte: Pointer to the PTE.
+ * @addr: Virtual address.
+ * @asw: Struct as_walk argument for the walk. Embed for additional
+ * data.
+ */
+	void (*const pte_entry)(pte_t *pte, unsigned long addr,
+ struct as_walk *asw);
+};
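+
+/*
+ * A minimal sketch of how a walk supplies its leaf callback (the names
+ * my_pte_cb and my_ops below are illustrative only):
+ *
+ *	static const struct as_walk_ops my_ops = { .pte_entry = my_pte_cb };
+ *
+ * The ops pointer is then passed, together with an embedding struct as_walk,
+ * to walk_as_pfn_range() below.
+ */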
+
+/* Page-walking code */
+static void walk_as_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ const struct as_walk_ops *ops,
+ struct as_walk *asw)
+{
+ struct mm_struct *mm = asw->vma->vm_mm;
+ pte_t *pte;
+ spinlock_t *ptl;
+
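+	/*
+	 * Kernel page tables (init_mm) have no pte spinlock; user page
+	 * tables are mapped and locked with pte_offset_map_lock() and
+	 * unlocked again after the walk below.
+	 */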
+ pte = (mm == &init_mm) ?
+ pte_offset_kernel(pmd, addr) :
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+
+ arch_enter_lazy_mmu_mode();
+
+ do {
+ ops->pte_entry(pte++, addr, asw);
+ } while (addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
+
+ if (mm != &init_mm)
+ pte_unmap_unlock(pte - 1, ptl);
+}
+
+static void walk_as_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ const struct as_walk_ops *ops,
+ struct as_walk *asw)
+{
+ pmd_t *pmd = pmd_offset(pud, addr);
+ unsigned long next;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ if (WARN_ON(pmd_huge(*pmd)))
+ continue;
+ walk_as_pte_range(pmd, addr, next, ops, asw);
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void walk_as_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ const struct as_walk_ops *ops,
+ struct as_walk *asw)
+{
+ pud_t *pud = pud_offset(p4d, addr);
+ unsigned long next;
+
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ if (WARN_ON(pud_huge(*pud)))
+ continue;
+ walk_as_pmd_range(pud, addr, next, ops, asw);
+ } while (pud++, addr = next, addr != end);
+}
+
+static void walk_as_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ const struct as_walk_ops *ops,
+ struct as_walk *asw)
+{
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ unsigned long next;
+
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ walk_as_pud_range(p4d, addr, next, ops, asw);
+ } while (p4d++, addr = next, addr != end);
+}
+
+static void walk_as_pfn_range(unsigned long addr, unsigned long end,
+ const struct as_walk_ops *ops,
+ struct as_walk *asw)
+{
+ pgd_t *pgd = pgd_offset(asw->vma->vm_mm, addr);
+ unsigned long next;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ walk_as_p4d_range(pgd, addr, next, ops, asw);
+ } while (pgd++, addr = next, addr != end);
+}
+
+
+/**
+ * struct as_walk_range - Argument for apply_as_range
+ * @base: The struct as_walk we embed for the page walk
+ * @start: Address of first modified pte
+ * @end: Address of last modified pte + 1
+ * @total: Total number of modified ptes
+ */
+struct as_walk_range {
+ struct as_walk base;
+ unsigned long start;
+ unsigned long end;
+ unsigned long total;
+};
+
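+/* Recover the containing struct as_walk_range from its embedded base. */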
+#define to_as_walk_range(_asw) container_of(_asw, struct as_walk_range, base)
+
+/**
+ * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
+ * @pte: Pointer to the pte
+ * @addr: The virtual page address
+ * @asw: Pointer to a struct as_walk embedded in a struct as_walk_range
+ *
+ * The function write-protects a pte and records the range in
+ * virtual address space of touched ptes for efficient range TLB flushes.
+ */
+static void apply_pt_wrprotect(pte_t *pte, unsigned long addr,
+ struct as_walk *asw)
+{
+ struct as_walk_range *awr = to_as_walk_range(asw);
+ pte_t ptent = *pte;
+
+ if (pte_write(ptent)) {
+ pte_t old_pte = ptep_modify_prot_start(asw->vma, addr, pte);
+
+ ptent = pte_wrprotect(old_pte);
+ ptep_modify_prot_commit(asw->vma, addr, pte, old_pte, ptent);
+ awr->total++;
+ awr->start = min(awr->start, addr);
+ awr->end = max(awr->end, addr + PAGE_SIZE);
+ }
+}
+
+/**
+ * struct as_walk_clean - Argument structure for apply_pt_clean
+ * @base: The struct as_walk_range we derive from
+ * @bitmap_pgoff: Address_space page offset of the first bit in @bitmap
+ * @bitmap: Bitmap with one bit for each page offset in the address_space range
+ * covered.
+ * @start: Address_space page offset of the first modified pte relative
+ * to @bitmap_pgoff
+ * @end: Address_space page offset one past the last modified pte, relative
+ * to @bitmap_pgoff
+ */
+struct as_walk_clean {
+ struct as_walk_range base;
+ pgoff_t bitmap_pgoff;
+ unsigned long *bitmap;
+ pgoff_t start;
+ pgoff_t end;
+};
+
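+/* Recover the containing struct as_walk_clean from its embedded base. */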
+#define to_as_walk_clean(_awr) container_of(_awr, struct as_walk_clean, base)
+
+/**
+ * apply_pt_clean - Leaf pte callback to clean a pte
+ * @pte: Pointer to the pte
+ * @addr: The virtual page address
+ * @asw: Pointer to a struct as_walk embedded in a struct as_walk_clean
+ *
+ * The function cleans a pte and records the range in
+ * virtual address space of touched ptes for efficient TLB flushes.
+ * It also records dirty ptes in a bitmap representing page offsets
+ * in the address_space, as well as the first and last of the bits
+ * touched.
+ */
+static void apply_pt_clean(pte_t *pte, unsigned long addr, struct as_walk *asw)
+{
+ struct as_walk_range *awr = to_as_walk_range(asw);
+ struct as_walk_clean *clean = to_as_walk_clean(awr);
+ pte_t ptent = *pte;
+
+ if (pte_dirty(ptent)) {
+ pgoff_t pgoff = ((addr - asw->vma->vm_start) >> PAGE_SHIFT) +
+ asw->vma->vm_pgoff - clean->bitmap_pgoff;
+ pte_t old_pte = ptep_modify_prot_start(asw->vma, addr, pte);
+
+ ptent = pte_mkclean(old_pte);
+ ptep_modify_prot_commit(asw->vma, addr, pte, old_pte, ptent);
+
+ awr->total++;
+ awr->start = min(awr->start, addr);
+ awr->end = max(awr->end, addr + PAGE_SIZE);
+
+ __set_bit(pgoff, clean->bitmap);
+ clean->start = min(clean->start, pgoff);
+ clean->end = max(clean->end, pgoff + 1);
+ }
+}
+
+/**
+ * apply_as_range - Apply a pte callback to all PTEs pointing into a range
+ * of an address_space.
+ * @mapping: Pointer to the struct address_space
+ * @first_index: First page offset in the address_space
+ * @nr: Number of incremental page offsets to cover
+ * @ops: The struct as_walk_ops holding the leaf pte callback to apply
+ * @awr: Closure structure recording the range of ptes actually touched
+ *
+ * Return: Number of ptes touched. Note that this number might be larger
+ * than @nr if there are overlapping vmas.
+ */
+static unsigned long apply_as_range(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr,
+ const struct as_walk_ops *ops,
+ struct as_walk_range *awr)
+{
+ struct vm_area_struct *vma;
+ pgoff_t vba, vea, cba, cea;
+ unsigned long start_addr, end_addr;
+ struct mmu_notifier_range range;
+
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
+ first_index + nr - 1) {
+ unsigned long vm_flags = READ_ONCE(vma->vm_flags);
+
+ /*
+ * We can only do advisory flag tests below, since we can't
+ * require the mm's mmap_sem to be held to protect the flags.
+		 * Therefore, callers that strictly depend on specific vm_flags
+		 * remaining constant throughout the operation must either
+		 * ensure those flags are immutable for all relevant vmas or
+		 * refrain from using this function. Fixing this properly would
+		 * require vm_flags to be protected by a separate lock taken
+		 * after the i_mmap_lock.
+ */
+
+ /* Skip non-applicable VMAs */
+ if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
+ (VM_SHARED | VM_WRITE))
+ continue;
+
+ /* Warn on and skip VMAs whose flags indicate illegal usage */
+ if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
+ continue;
+
+ /* Clip to the vma */
+ vba = vma->vm_pgoff;
+ vea = vba + vma_pages(vma);
+ cba = first_index;
+ cba = max(cba, vba);
+ cea = first_index + nr;
+ cea = min(cea, vea);
+
+ /* Translate to virtual address */
+ start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
+ end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
+ if (start_addr >= end_addr)
+ continue;
+
+ awr->base.vma = vma;
+ awr->start = end_addr;
+ awr->end = start_addr;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
+ vma, vma->vm_mm, start_addr, end_addr);
+ mmu_notifier_invalidate_range_start(&range);
+
+ /* Is this needed when we only change protection? */
+ flush_cache_range(vma, start_addr, end_addr);
+
+ /*
+ * We're not using tlb_gather_mmu() since typically
+ * only a small subrange of PTEs are affected, whereas
+ * tlb_gather_mmu() records the full range.
+ */
+ inc_tlb_flush_pending(vma->vm_mm);
+ walk_as_pfn_range(start_addr, end_addr, ops, &awr->base);
+ if (awr->end > awr->start)
+ flush_tlb_range(vma, awr->start, awr->end);
+
+ mmu_notifier_invalidate_range_end(&range);
+ dec_tlb_flush_pending(vma->vm_mm);
+ }
+ i_mmap_unlock_read(mapping);
+
+ return awr->total;
+}
+
+/**
+ * apply_as_wrprotect - Write-protect all ptes in an address_space range
+ * @mapping: The address_space we want to write protect
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ *
+ * WARNING: This function should only be used for address spaces whose
+ * vmas are marked VM_IO and that do not contain huge pages.
+ * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
+ * simply skipped.
+ *
+ * Return: The number of ptes actually write-protected. Note that
+ * already write-protected ptes are not counted.
+ */
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr)
+{
+ static const struct as_walk_ops ops = {
+ .pte_entry = apply_pt_wrprotect
+ };
+ struct as_walk_range awr = { .total = 0 };
+
+ return apply_as_range(mapping, first_index, nr, &ops, &awr);
+}
+EXPORT_SYMBOL_GPL(apply_as_wrprotect);
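+
+/*
+ * A minimal usage sketch, assuming hypothetical driver state: "mapping",
+ * "first_pgoff" and "npages" describe the tracked address_space range.
+ * The driver write-protects the range and then relies on its
+ * page_mkwrite() / pfn_mkwrite() handler to observe subsequent writes:
+ *
+ *	unsigned long num = apply_as_wrprotect(mapping, first_pgoff, npages);
+ */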
+
+/**
+ * apply_as_clean - Clean all ptes in an address_space range
+ * @mapping: The address_space we want to clean
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ * @bitmap_pgoff: The page offset of the first bit in @bitmap
+ * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
+ * cover the whole range @first_index..@first_index + @nr.
+ * @start: Pointer to the page offset, relative to @bitmap_pgoff, of the first
+ * set bit in @bitmap, or a value >= *@end if no bit is set. The value is
+ * modified as new bits are set by the function.
+ * @end: Pointer to the page offset one past the last set bit in @bitmap, or
+ * zero if no bit is set. The value is modified as new bits are set by the
+ * function.
+ *
+ * Note: When this function returns there is no guarantee that a CPU has
+ * not already dirtied new ptes. However it will not clean any ptes not
+ * reported in the bitmap.
+ *
+ * If a caller needs to make sure all dirty ptes are picked up and none
+ * additional are added, it first needs to write-protect the address-space
+ * range and make sure new writers are blocked in page_mkwrite() or
+ * pfn_mkwrite(). Then, after the TLB flush that follows the write-protection,
+ * it can pick up all dirty bits.
+ *
+ * WARNING: This function should only be used for address spaces whose
+ * vmas are marked VM_IO and that do not contain huge pages.
+ * To avoid interference with COW'd pages, vmas not marked VM_SHARED are
+ * simply skipped.
+ *
+ * Return: The number of dirty ptes actually cleaned.
+ */
+unsigned long apply_as_clean(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr,
+ pgoff_t bitmap_pgoff,
+ unsigned long *bitmap,
+ pgoff_t *start,
+ pgoff_t *end)
+{
+ bool none_set = (*start >= *end);
+ static const struct as_walk_ops ops = { .pte_entry = apply_pt_clean };
+ struct as_walk_clean clean = {
+ .base = { .total = 0, },
+ .bitmap_pgoff = bitmap_pgoff,
+ .bitmap = bitmap,
+ .start = none_set ? nr : *start,
+ .end = none_set ? 0 : *end,
+ };
+ unsigned long ret = apply_as_range(mapping, first_index, nr, &ops,
+ &clean.base);
+ *start = clean.start;
+ *end = clean.end;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(apply_as_clean);
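+
+/*
+ * A minimal sketch of the protocol described above, assuming hypothetical
+ * driver state: a tracked range of TRACKED_PAGES pages starting at page
+ * offset first_pgoff of "mapping", and a driver helper handle_dirty_page().
+ * The range is write-protected first so that new writers are caught in
+ * page_mkwrite() / pfn_mkwrite(); the accumulated dirty bits are then
+ * collected and handled:
+ *
+ *	unsigned long bitmap[BITS_TO_LONGS(TRACKED_PAGES)] = { 0 };
+ *	pgoff_t i, start = 0, end = 0;
+ *
+ *	apply_as_wrprotect(mapping, first_pgoff, TRACKED_PAGES);
+ *	apply_as_clean(mapping, first_pgoff, TRACKED_PAGES, first_pgoff,
+ *		       bitmap, &start, &end);
+ *	for_each_set_bit(i, bitmap, TRACKED_PAGES)
+ *		handle_dirty_page(mapping, first_pgoff + i);
+ */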