--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -126,13 +126,12 @@ static void page_cache_delete(struct address_space *mapping,
/* hugetlb pages are represented by a single entry in the xarray */
if (!PageHuge(page)) {
- xas_set_order(&xas, page->index, compound_order(page));
- nr = compound_nr(page);
+ xas_set_order(&xas, page->index, thp_order(page));
+ nr = thp_nr_pages(page);
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(nr != 1 && shadow, page);
xas_store(&xas, shadow);
xas_init_marks(&xas);
@@ -322,19 +321,12 @@ static void page_cache_delete_batch(struct address_space *mapping,
WARN_ON_ONCE(!PageLocked(page));
- if (page->index == xas.xa_index)
- page->mapping = NULL;
+ page->mapping = NULL;
/* Leave page->index set: truncation lookup relies on it */
- /*
- * Move to the next page in the vector if this is a regular
- * page or the index is of the last sub-page of this compound
- * page.
- */
- if (page->index + compound_nr(page) - 1 == xas.xa_index)
- i++;
+ i++;
xas_store(&xas, NULL);
- total_pages++;
+ total_pages += thp_nr_pages(page);
}
mapping->nrpages -= total_pages;
}
@@ -851,20 +843,24 @@ static int __add_to_page_cache_locked(struct page *page,
}
do {
- xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
- xas_store(&xas, page);
- if (xas_error(&xas))
- goto unlock;
+ unsigned long exceptional = 0;
- if (xa_is_value(old)) {
- mapping->nrexceptional--;
+ xas_lock_irq(&xas);
+ xas_for_each_conflict(&xas, old) {
+ if (!xa_is_value(old)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ exceptional++;
if (shadowp)
*shadowp = old;
}
- mapping->nrpages++;
+
+ xas_store(&xas, page);
+ if (xas_error(&xas))
+ goto unlock;
+ mapping->nrexceptional -= exceptional;
+ mapping->nrpages += nr;
/* hugetlb pages do not participate in page cache accounting */
if (!huge)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2603,6 +2603,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
struct deferred_split *ds_queue = get_deferred_split_queue(head);
+ XA_STATE_ORDER(xas, &head->mapping->i_pages, head->index,
+ compound_order(head));
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
@@ -2667,19 +2669,28 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
+ if (mapping) {
+ /* XXX: Need better GFP flags here */
+ xas_split_alloc(&xas, head, 0, GFP_ATOMIC);
+ if (xas_error(&xas)) {
+ ret = xas_error(&xas);
+ goto out_unlock;
+ }
+ }
+
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irqsave(&pgdata->lru_lock, flags);
if (mapping) {
- XA_STATE(xas, &mapping->i_pages, page_index(head));
-
/*
* Check if the head page is present in page cache.
* We assume all tail are present too, if head is there.
*/
- xa_lock(&mapping->i_pages);
+ xas_lock(&xas);
+ xas_reset(&xas);
if (xas_load(&xas) != head)
goto fail;
+ xas_split(&xas, head, 0);
}
/* Prevent deferred_split_scan() touching ->_refcount */
@@ -2717,7 +2728,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
spin_unlock(&ds_queue->split_queue_lock);
fail: if (mapping)
- xa_unlock(&mapping->i_pages);
+ xas_unlock(&xas);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
remap_page(head);
ret = -EBUSY;
@@ -2731,6 +2742,8 @@ fail: if (mapping)
if (mapping)
i_mmap_unlock_read(mapping);
out:
+ /* Free any memory we didn't use */
+ xas_nomem(&xas, 0);
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
return ret;
}
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1638,7 +1638,10 @@ static void collapse_file(struct mm_struct *mm,
}
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
- /* This will be less messy when we use multi-index entries */
+ /*
+ * Ensure we have slots for all the pages in the range. This is
+ * almost certainly a no-op because most of the pages must be present
+ */
do {
xas_lock_irq(&xas);
xas_create_range(&xas);
@@ -1844,6 +1847,9 @@ static void collapse_file(struct mm_struct *mm,
__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
+ /* Join all the small entries into a single multi-index entry */
+ xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+ xas_store(&xas, new_page);
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
@@ -1965,6 +1971,10 @@ static void khugepaged_scan_file(struct mm_struct *mm,
continue;
}
+ /*
+ * XXX: khugepaged should compact smaller compound pages
+ * into a PMD sized page
+ */
if (PageTransCompound(page)) {
result = SCAN_PAGE_COMPOUND;
break;
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -608,7 +608,6 @@ static int shmem_add_to_page_cache(struct page *page,
struct mm_struct *charge_mm)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
- unsigned long i = 0;
unsigned long nr = compound_nr(page);
int error;
@@ -638,17 +637,11 @@ static int shmem_add_to_page_cache(struct page *page,
void *entry;
xas_lock_irq(&xas);
entry = xas_find_conflict(&xas);
- if (entry != expected)
+ if (entry != expected) {
xas_set_err(&xas, -EEXIST);
- xas_create_range(&xas);
- if (xas_error(&xas))
goto unlock;
-next:
- xas_store(&xas, page);
- if (++i < nr) {
- xas_next(&xas);
- goto next;
}
+ xas_store(&xas, page);
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
__inc_node_page_state(page, NR_SHMEM_THPS);
We currently store order-N THPs as 2^N consecutive entries.  While this
consumes rather more memory than necessary, it also turns out to be buggy
for filesystems which track dirty pages: a writeback operation which
starts in the middle of a dirty THP will not notice, as the dirty bit is
only set on the head index.  With multi-index entries, the dirty bit will
be found no matter which index within the THP the writeback starts from.

This does end up simplifying the page cache slightly, although not as
much as I had hoped.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/filemap.c     | 42 +++++++++++++++++++-----------------------
 mm/huge_memory.c | 21 +++++++++++++++++----
 mm/khugepaged.c  | 12 +++++++++++-
 mm/shmem.c       | 11 ++---------
 4 files changed, 49 insertions(+), 37 deletions(-)
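
To make the difference concrete, the sketch below (illustrative only, not
part of the patch) contrasts the two storage schemes for a THP in the page
cache.  The helper names store_thp_consecutive() and store_thp_multi_index()
are invented for the example, and the xas_nomem() retry/error handling that
real callers need is omitted; the XArray calls themselves are the same ones
the patch uses.

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>

/*
 * Old scheme: 2^N consecutive entries, one per subpage index, all
 * pointing at the head page.  Assumes 'page' is a locked THP.
 */
static void store_thp_consecutive(struct address_space *mapping,
				  struct page *page)
{
	XA_STATE(xas, &mapping->i_pages, page->index);
	unsigned long i, nr = thp_nr_pages(page);

	xas_lock_irq(&xas);
	for (i = 0; i < nr; i++) {
		xas_store(&xas, page);	/* separate entry for each index */
		xas_next(&xas);
	}
	xas_unlock_irq(&xas);
}

/*
 * New scheme: a single multi-index entry covering the whole range
 * [page->index, page->index + 2^N - 1].
 */
static void store_thp_multi_index(struct address_space *mapping,
				  struct page *page)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, page->index, thp_order(page));

	xas_lock_irq(&xas);
	xas_store(&xas, page);		/* one entry spans all 2^N indices */
	xas_unlock_irq(&xas);

	/*
	 * A lookup at any index inside the range returns the same entry,
	 * so a mark such as PAGECACHE_TAG_DIRTY set on it is visible no
	 * matter where a writeback walk starts.
	 */
	VM_BUG_ON_PAGE(xa_load(&mapping->i_pages, page->index + 1) != page,
		       page);
}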