[v2,19/22] mpool: add support to mmap arbitrary collection of mblocks

Message ID	20201012162736.65241-20-nmeeramohide@micron.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=jKGj=DT=vger.kernel.org=linux-block-owner@kernel.org> Received-SPF: Pass (protection.outlook.com: domain of micron.com designates 137.201.242.130 as permitted sender) receiver=protection.outlook.com; client-ip=137.201.242.130; helo=mail.micron.com; From: Nabeel M Mohamed <nmeeramohide@micron.com> To: <linux-kernel@vger.kernel.org>, <linux-block@vger.kernel.org>, <linux-nvme@lists.infradead.org>, <linux-mm@kvack.org>, <linux-nvdimm@lists.01.org> CC: <smoyer@micron.com>, <gbecker@micron.com>, <plabat@micron.com>, <jgroves@micron.com>, Nabeel M Mohamed <nmeeramohide@micron.com> Subject: [PATCH v2 19/22] mpool: add support to mmap arbitrary collection of mblocks Date: Mon, 12 Oct 2020 11:27:33 -0500 Message-ID: <20201012162736.65241-20-nmeeramohide@micron.com> In-Reply-To: <20201012162736.65241-1-nmeeramohide@micron.com> References: <20201012162736.65241-1-nmeeramohide@micron.com> MIME-Version: 1.0 Content-Type: text/plain Precedence: bulk
Series	add Object Storage Media Pool (mpool) \| expand [v2,00/22] add Object Storage Media Pool (mpool) [v2,01/22] mpool: add utility routines and ioctl definitions [v2,02/22] mpool: add in-memory struct definitions [v2,03/22] mpool: add on-media struct definitions [v2,04/22] mpool: add pool drive component which handles mpool IO using the block layer API [v2,05/22] mpool: add space map component which manages free space on mpool devices [v2,06/22] mpool: add on-media pack, unpack and upgrade routines [v2,07/22] mpool: add superblock management routines [v2,08/22] mpool: add pool metadata routines to manage object lifecycle and IO [v2,09/22] mpool: add mblock lifecycle management and IO routines [v2,10/22] mpool: add mlog IO utility routines [v2,11/22] mpool: add mlog lifecycle management and IO routines [v2,12/22] mpool: add metadata container or mlog-pair framework [v2,13/22] mpool: add utility routines for mpool lifecycle management [v2,14/22] mpool: add pool metadata routines to create persistent mpools [v2,15/22] mpool: add mpool lifecycle management routines [v2,16/22] mpool: add mpool control plane utility routines [v2,17/22] mpool: add mpool lifecycle management ioctls [v2,18/22] mpool: add object lifecycle management ioctls [v2,19/22] mpool: add support to mmap arbitrary collection of mblocks [v2,20/22] mpool: add support to proactively evict cached mblock data from the page-cache [v2,21/22] mpool: add documentation [v2,22/22] mpool: add Kconfig and Makefile

diff --git a/drivers/mpool/init.c b/drivers/mpool/init.c index 126c6c7142b5..b1fe3286773a 100644 --- a/drivers/mpool/init.c +++ b/drivers/mpool/init.c @@ -12,11 +12,20 @@ #include "smap.h" #include "pmd_obj.h" #include "sb.h" +#include "mcache.h" #include "mpctl.h" /* * Module params... */ +unsigned int xvm_max __read_mostly = 1048576 * 128; +module_param(xvm_max, uint, 0444); +MODULE_PARM_DESC(xvm_max, " max extended VMA regions"); + +unsigned int xvm_size_max __read_mostly = 30; +module_param(xvm_size_max, uint, 0444); +MODULE_PARM_DESC(xvm_size_max, " max extended VMA size log2"); + unsigned int maxunits __read_mostly = 1024; module_param(maxunits, uint, 0444); MODULE_PARM_DESC(maxunits, " max mpools"); @@ -40,6 +49,7 @@ MODULE_PARM_DESC(chunk_size_kb, "Chunk size (in KiB) for device I/O"); static void mpool_exit_impl(void) { mpctl_exit(); + mcache_exit(); pmd_exit(); smap_exit(); sb_exit(); @@ -82,6 +92,12 @@ static __init int mpool_init(void) goto errout; } + rc = mcache_init(); + if (rc) { + errmsg = "mcache init failed"; + goto errout; + } + rc = mpctl_init(); if (rc) { errmsg = "mpctl init failed"; diff --git a/drivers/mpool/init.h b/drivers/mpool/init.h index 3d8f809a5e45..507d43a55c01 100644 --- a/drivers/mpool/init.h +++ b/drivers/mpool/init.h @@ -6,6 +6,8 @@ #ifndef MPOOL_INIT_H #define MPOOL_INIT_H +extern unsigned int xvm_max; +extern unsigned int xvm_size_max; extern unsigned int maxunits; extern unsigned int rwsz_max_mb; extern unsigned int rwconc_max; diff --git a/drivers/mpool/mcache.c b/drivers/mpool/mcache.c new file mode 100644 index 000000000000..07c79615ecf1 --- /dev/null +++ b/drivers/mpool/mcache.c @@ -0,0 +1,1029 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015-2020 Micron Technology, Inc. All rights reserved. + */ + +#include <linux/module.h> +#include <linux/memcontrol.h> +#include <linux/kref.h> +#include <linux/migrate.h> +#include <linux/delay.h> +#include <linux/uio.h> +#include <linux/fadvise.h> +#include <linux/prefetch.h> + +#include "mpool_ioctl.h" + +#include "mp.h" +#include "mpool_printk.h" +#include "assert.h" + +#include "mpctl.h" + +#ifndef lru_to_page +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) +#endif + +/* + * MPC_RA_IOV_MAX - Max pages per call to mblock read by a readahead + * request. Be careful about increasing this as it directly adds + * (n * 24) bytes to the stack frame of mpc_readpages_cb(). + */ +#define MPC_RA_IOV_MAX (8) + +#define NODEV MKDEV(0, 0) /* Non-existent device */ + +/* + * Arguments required to initiate an asynchronous call to mblock_read() + * and which must also be preserved across that call. + * + * Note: We could make things more efficient by changing a_pagev[] + * to a struct kvec if mblock_read() would guarantee that it will + * not alter the given iovec. + */ +struct readpage_args { + void *a_xvm; + struct mblock_descriptor *a_mbdesc; + u64 a_mboffset; + int a_pagec; + struct page *a_pagev[]; +}; + +struct readpage_work { + struct work_struct w_work; + struct readpage_args w_args; +}; + +static void mpc_xvm_put(struct mpc_xvm *xvm); + +static int mpc_readpage_impl(struct page *page, struct mpc_xvm *map); + +/* The following structures are initialized at the end of this file. */ +static const struct vm_operations_struct mpc_vops_default; +const struct address_space_operations mpc_aops_default; + +static struct workqueue_struct *mpc_wq_trunc __read_mostly; +static struct workqueue_struct *mpc_wq_rav[4] __read_mostly; + +static size_t mpc_xvm_cachesz[2] __read_mostly; +static struct kmem_cache *mpc_xvm_cache[2] __read_mostly; + +static struct workqueue_struct *mpc_rgn2wq(uint rgn) +{ + return mpc_wq_rav[rgn % ARRAY_SIZE(mpc_wq_rav)]; +} + +static int mpc_rgnmap_isorphan(int rgn, void *item, void *data) +{ + struct mpc_xvm *xvm = item; + void **headp = data; + + if (xvm && kref_read(&xvm->xvm_ref) == 1 && !atomic_read(&xvm->xvm_opened)) { + idr_replace(&xvm->xvm_rgnmap->rm_root, NULL, rgn); + xvm->xvm_next = *headp; + *headp = xvm; + } + + return ITERCB_NEXT; +} + +void mpc_rgnmap_flush(struct mpc_rgnmap *rm) +{ + struct mpc_xvm *head = NULL, *xvm; + + if (!rm) + return; + + /* Wait for all mpc_xvm_free_cb() callbacks to complete... */ + flush_workqueue(mpc_wq_trunc); + + /* + * Build a list of all orphaned XVMs and release their birth + * references (i.e., XVMs that were created but never mmapped). + */ + mutex_lock(&rm->rm_lock); + idr_for_each(&rm->rm_root, mpc_rgnmap_isorphan, &head); + mutex_unlock(&rm->rm_lock); + + while ((xvm = head)) { + head = xvm->xvm_next; + mpc_xvm_put(xvm); + } +} + +static struct mpc_xvm *mpc_xvm_lookup(struct mpc_rgnmap *rm, uint key) +{ + struct mpc_xvm *xvm; + + mutex_lock(&rm->rm_lock); + xvm = idr_find(&rm->rm_root, key); + if (xvm && !kref_get_unless_zero(&xvm->xvm_ref)) + xvm = NULL; + mutex_unlock(&rm->rm_lock); + + return xvm; +} + +void mpc_xvm_free(struct mpc_xvm *xvm) +{ + struct mpc_rgnmap *rm; + + ASSERT((u32)(uintptr_t)xvm == xvm->xvm_magic); + + rm = xvm->xvm_rgnmap; + + mutex_lock(&rm->rm_lock); + idr_remove(&rm->rm_root, xvm->xvm_rgn); + mutex_unlock(&rm->rm_lock); + + xvm->xvm_magic = 0xbadcafe; + xvm->xvm_rgn = -1; + + kmem_cache_free(xvm->xvm_cache, xvm); + + atomic_dec(&rm->rm_rgncnt); +} + +static void mpc_xvm_free_cb(struct work_struct *work) +{ + struct mpc_xvm *xvm = container_of(work, typeof(*xvm), xvm_work); + + mpc_xvm_free(xvm); +} + +static void mpc_xvm_get(struct mpc_xvm *xvm) +{ + kref_get(&xvm->xvm_ref); +} + +static void mpc_xvm_release(struct kref *kref) +{ + struct mpc_xvm *xvm = container_of(kref, struct mpc_xvm, xvm_ref); + struct mpc_rgnmap *rm = xvm->xvm_rgnmap; + int i; + + ASSERT((u32)(uintptr_t)xvm == xvm->xvm_magic); + + mutex_lock(&rm->rm_lock); + ASSERT(kref_read(kref) == 0); + idr_replace(&rm->rm_root, NULL, xvm->xvm_rgn); + mutex_unlock(&rm->rm_lock); + + /* + * Wait for all in-progress readaheads to complete + * before we drop our mblock references. + */ + if (atomic_add_return(WQ_MAX_ACTIVE, &xvm->xvm_rabusy) > WQ_MAX_ACTIVE) + flush_workqueue(mpc_rgn2wq(xvm->xvm_rgn)); + + for (i = 0; i < xvm->xvm_mbinfoc; ++i) + mblock_put(xvm->xvm_mbinfov[i].mbdesc); + + INIT_WORK(&xvm->xvm_work, mpc_xvm_free_cb); + queue_work(mpc_wq_trunc, &xvm->xvm_work); +} + +static void mpc_xvm_put(struct mpc_xvm *xvm) +{ + kref_put(&xvm->xvm_ref, mpc_xvm_release); +} + +/* + * VM operations + */ + +static void mpc_vm_open(struct vm_area_struct *vma) +{ + mpc_xvm_get(vma->vm_private_data); +} + +static void mpc_vm_close(struct vm_area_struct *vma) +{ + mpc_xvm_put(vma->vm_private_data); +} + +static int mpc_alloc_and_readpage(struct vm_area_struct *vma, pgoff_t offset, gfp_t gfp) +{ + struct address_space *mapping; + struct file *file; + struct page *page; + int rc; + + page = __page_cache_alloc(gfp | __GFP_NOWARN); + if (!page) + return -ENOMEM; + + file = vma->vm_file; + mapping = file->f_mapping; + + rc = add_to_page_cache_lru(page, mapping, offset, gfp & GFP_KERNEL); + if (rc == 0) + rc = mpc_readpage_impl(page, vma->vm_private_data); + else if (rc == -EEXIST) + rc = 0; + + put_page(page); + + return rc; +} + +static bool mpc_lock_page_or_retry(struct page *page, struct mm_struct *mm, uint flags) +{ + might_sleep(); + + if (trylock_page(page)) + return true; + + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (flags & FAULT_FLAG_RETRY_NOWAIT) + return false; + + mmap_read_unlock(mm); + /* _killable version is not exported by the kernel. */ + wait_on_page_locked(page); + return false; + } + + if (flags & FAULT_FLAG_KILLABLE) { + int rc; + + rc = lock_page_killable(page); + if (rc) { + mmap_read_unlock(mm); + return false; + } + } else { + lock_page(page); + } + + return true; +} + +static int mpc_handle_page_error(struct page *page, struct vm_area_struct *vma) +{ + int rc; + + ClearPageError(page); + + rc = mpc_readpage_impl(page, vma->vm_private_data); + if (rc == 0) { + wait_on_page_locked(page); + if (!PageUptodate(page)) + rc = -EIO; + } + + put_page(page); + + return rc; +} + +static vm_fault_t mpc_vm_fault_impl(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping; + struct inode *inode; + struct page *page; + vm_fault_t vmfrc; + pgoff_t offset; + loff_t size; + + mapping = vma->vm_file->f_mapping; + inode = mapping->host; + offset = vmf->pgoff; + vmfrc = 0; + + size = round_up(i_size_read(inode), PAGE_SIZE); + if (offset >= (size >> PAGE_SHIFT)) + return VM_FAULT_SIGBUS; + +retry_find: + page = find_get_page(mapping, offset); + if (!page) { + int rc = mpc_alloc_and_readpage(vma, offset, mapping_gfp_mask(mapping)); + + if (rc < 0) + return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + + vmfrc = VM_FAULT_MAJOR; + goto retry_find; + } + + /* At this point, page is not locked but has a ref. */ + if (vmfrc == VM_FAULT_MAJOR) + count_vm_event(PGMAJFAULT); + + if (!mpc_lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + put_page(page); + return vmfrc | VM_FAULT_RETRY; + } + + /* At this point, page is locked with a ref. */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + + VM_BUG_ON_PAGE(page->index != offset, page); + + if (unlikely(!PageUptodate(page))) { + int rc = mpc_handle_page_error(page, vma); + + /* At this point, page is not locked and has no ref. */ + if (rc) + return VM_FAULT_SIGBUS; + goto retry_find; + } + + /* Page is locked with a ref. */ + vmf->page = page; + + return vmfrc | VM_FAULT_LOCKED; +} + +static vm_fault_t mpc_vm_fault(struct vm_fault *vmf) +{ + return mpc_vm_fault_impl(vmf->vma, vmf); +} + +/* + * MPCTL address-space operations. + */ + +static int mpc_readpage_impl(struct page *page, struct mpc_xvm *xvm) +{ + struct mpc_mbinfo *mbinfo; + struct kvec iov[1]; + off_t offset; + uint mbnum; + int rc; + + offset = page->index << PAGE_SHIFT; + offset %= (1ul << xvm_size_max); + + mbnum = offset / xvm->xvm_bktsz; + if (mbnum >= xvm->xvm_mbinfoc) { + unlock_page(page); + return -EINVAL; + } + + mbinfo = xvm->xvm_mbinfov + mbnum; + offset %= xvm->xvm_bktsz; + + if (offset >= mbinfo->mblen) { + unlock_page(page); + return -EINVAL; + } + + iov[0].iov_base = page_address(page); + iov[0].iov_len = PAGE_SIZE; + + rc = mblock_read(xvm->xvm_mpdesc, mbinfo->mbdesc, iov, 1, offset, PAGE_SIZE); + if (rc) { + unlock_page(page); + return rc; + } + + if (xvm->xvm_hcpagesp) + atomic64_inc(xvm->xvm_hcpagesp); + atomic64_inc(&xvm->xvm_nrpages); + + SetPagePrivate(page); + set_page_private(page, (ulong)xvm); + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + +#define MPC_RPARGSBUFSZ \ + (sizeof(struct readpage_args) + MPC_RA_IOV_MAX * sizeof(void *)) + +/** + * mpc_readpages_cb() - mpc_readpages() callback + * @work: w_work.work from struct readpage_work + * + * The incoming arguments are in the first page (a_pagev[0]) which + * we are about to overwrite, so we copy them to the stack. + */ +static void mpc_readpages_cb(struct work_struct *work) +{ + struct kvec iovbuf[MPC_RA_IOV_MAX]; + char argsbuf[MPC_RPARGSBUFSZ]; + struct readpage_args *args = (void *)argsbuf; + struct readpage_work *w; + struct mpc_xvm *xvm; + struct kvec *iov = iovbuf; + size_t argssz; + int pagec, rc, i; + + w = container_of(work, struct readpage_work, w_work); + + pagec = w->w_args.a_pagec; + argssz = sizeof(*args) + sizeof(args->a_pagev[0]) * pagec; + + ASSERT(pagec <= ARRAY_SIZE(iovbuf)); + ASSERT(argssz <= sizeof(argsbuf)); + + memcpy(args, &w->w_args, argssz); + w = NULL; /* Do not touch! */ + + xvm = args->a_xvm; + + /* + * Synchronize with mpc_xvm_put() to prevent dropping our + * mblock references while there are reads in progress. + */ + if (atomic_inc_return(&xvm->xvm_rabusy) > WQ_MAX_ACTIVE) { + rc = -ENXIO; + goto errout; + } + + for (i = 0; i < pagec; ++i) { + iov[i].iov_base = page_address(args->a_pagev[i]); + iov[i].iov_len = PAGE_SIZE; + } + + rc = mblock_read(xvm->xvm_mpdesc, args->a_mbdesc, iov, + pagec, args->a_mboffset, pagec << PAGE_SHIFT); + if (rc) + goto errout; + + if (xvm->xvm_hcpagesp) + atomic64_add(pagec, xvm->xvm_hcpagesp); + atomic64_add(pagec, &xvm->xvm_nrpages); + atomic_dec(&xvm->xvm_rabusy); + + for (i = 0; i < pagec; ++i) { + struct page *page = args->a_pagev[i]; + + SetPagePrivate(page); + set_page_private(page, (ulong)xvm); + SetPageUptodate(page); + + unlock_page(page); + put_page(page); + } + + return; + +errout: + atomic_dec(&xvm->xvm_rabusy); + + for (i = 0; i < pagec; ++i) { + unlock_page(args->a_pagev[i]); + put_page(args->a_pagev[i]); + } +} + +static int mpc_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, uint nr_pages) +{ + struct workqueue_struct *wq; + struct readpage_work *w; + struct work_struct *work; + struct mpc_mbinfo *mbinfo; + struct mpc_unit *unit; + struct mpc_xvm *xvm; + struct page *page; + off_t offset, mbend; + uint mbnum, iovmax, i; + uint ra_pages_max; + ulong index; + gfp_t gfp; + u32 key; + int rc; + + unit = file->private_data; + + ra_pages_max = unit->un_ra_pages_max; + if (ra_pages_max < 1) + return 0; + + page = lru_to_page(pages); + offset = page->index << PAGE_SHIFT; + index = page->index; + work = NULL; + w = NULL; + + key = offset >> xvm_size_max; + + /* + * The idr value here (xvm) is pinned for the lifetime of the address map. + * Therefore, we can exit the rcu read-side critsec without worry that xvm will be + * destroyed before put_page() has been called on each and every page in the given + * list of pages. + */ + rcu_read_lock(); + xvm = idr_find(&unit->un_rgnmap.rm_root, key); + rcu_read_unlock(); + + if (!xvm) + return 0; + + offset %= (1ul << xvm_size_max); + + mbnum = offset / xvm->xvm_bktsz; + if (mbnum >= xvm->xvm_mbinfoc) + return 0; + + mbinfo = xvm->xvm_mbinfov + mbnum; + + mbend = mbnum * xvm->xvm_bktsz + mbinfo->mblen; + iovmax = MPC_RA_IOV_MAX; + + gfp = mapping_gfp_mask(mapping) & GFP_KERNEL; + wq = mpc_rgn2wq(xvm->xvm_rgn); + + nr_pages = min_t(uint, nr_pages, ra_pages_max); + + for (i = 0; i < nr_pages; ++i) { + page = lru_to_page(pages); + offset = page->index << PAGE_SHIFT; + offset %= (1ul << xvm_size_max); + + /* Don't read past the end of the mblock. */ + if (offset >= mbend) + break; + + /* mblock reads must be logically contiguous. */ + if (page->index != index && work) { + queue_work(wq, work); + work = NULL; + } + + index = page->index + 1; /* next expected page index */ + + prefetchw(&page->flags); + list_del(&page->lru); + + rc = add_to_page_cache_lru(page, mapping, page->index, gfp); + if (rc) { + if (work) { + queue_work(wq, work); + work = NULL; + } + put_page(page); + continue; + } + + if (!work) { + w = page_address(page); + INIT_WORK(&w->w_work, mpc_readpages_cb); + w->w_args.a_xvm = xvm; + w->w_args.a_mbdesc = mbinfo->mbdesc; + w->w_args.a_mboffset = offset % xvm->xvm_bktsz; + w->w_args.a_pagec = 0; + work = &w->w_work; + + iovmax = MPC_RA_IOV_MAX; + iovmax -= page->index % MPC_RA_IOV_MAX; + } + + w->w_args.a_pagev[w->w_args.a_pagec++] = page; + + /* + * Restrict batch size to the number of struct kvecs + * that will fit into a page (minus our header). + */ + if (w->w_args.a_pagec >= iovmax) { + queue_work(wq, work); + work = NULL; + } + } + + if (work) + queue_work(wq, work); + + return 0; +} + +/** + * mpc_releasepage() - Linux VM calls the release page when pages are released. + * @page: + * @gfp: + * + * The function is added as part of tracking incoming and outgoing pages. + */ +static int mpc_releasepage(struct page *page, gfp_t gfp) +{ + struct mpc_xvm *xvm; + + if (!PagePrivate(page)) + return 0; + + xvm = (void *)page_private(page); + if (!xvm) + return 0; + + ClearPagePrivate(page); + set_page_private(page, 0); + + ASSERT((u32)(uintptr_t)xvm == xvm->xvm_magic); + + if (xvm->xvm_hcpagesp) + atomic64_dec(xvm->xvm_hcpagesp); + atomic64_dec(&xvm->xvm_nrpages); + + return 1; +} + +static void mpc_invalidatepage(struct page *page, uint offset, uint length) +{ + mpc_releasepage(page, 0); +} + +/** + * mpc_migratepage() - Callback for handling page migration. + * @mapping: + * @newpage: + * @page: + * @mode: + * + * The drivers having private pages are supplying this callback. + * Not sure the page migration releases or invalidates the page being migrated, + * or else the tracking of incoming and outgoing pages will be in trouble. The + * callback is added to deal with uncertainties around migration. The migration + * will be declined so long as the page is private and it belongs to mpctl. + */ +static int mpc_migratepage(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + + ASSERT(PageLocked(page)); + + return migrate_page(mapping, newpage, page, mode); +} + +int mpc_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct mpc_unit *unit = fp->private_data; + struct mpc_xvm *xvm; + off_t off; + ulong len; + u32 key; + + off = vma->vm_pgoff << PAGE_SHIFT; + len = vma->vm_end - vma->vm_start - 1; + + /* Verify that the request does not cross an xvm region boundary. */ + if ((off >> xvm_size_max) != ((off + len) >> xvm_size_max)) + return -EINVAL; + + /* Acquire a reference on the region map for this region. */ + key = off >> xvm_size_max; + + xvm = mpc_xvm_lookup(&unit->un_rgnmap, key); + if (!xvm) + return -EINVAL; + + /* + * Drop the birth ref on first open so that the final call + * to mpc_vm_close() will cause the vma to be destroyed. + */ + if (atomic_inc_return(&xvm->xvm_opened) == 1) + mpc_xvm_put(xvm); + + vma->vm_ops = &mpc_vops_default; + + vma->vm_flags &= ~(VM_RAND_READ | VM_SEQ_READ); + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + + vma->vm_flags = (VM_DONTEXPAND | VM_DONTDUMP | VM_NORESERVE); + vma->vm_flags |= VM_MAYREAD | VM_READ | VM_RAND_READ; + + vma->vm_private_data = xvm; + + fp->f_ra.ra_pages = unit->un_ra_pages_max; + fp->f_mode |= FMODE_RANDOM; + + return 0; +} + +/** + * mpc_fadvise() - + * + * mpc_fadvise() currently handles only POSIX_FADV_WILLNEED. + * + * The code path that leads here is: madvise_willneed() -> vfs_fadvise() -> mpc_fadvise() + */ +int mpc_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + pgoff_t start, end; + + if (!file) + return -EINVAL; + + if (advice != POSIX_FADV_WILLNEED) + return -EOPNOTSUPP; + + start = offset >> PAGE_SHIFT; + end = (offset + len - 1) >> PAGE_SHIFT; + + if (end < start) + return -EINVAL; + + /* To force page cache readahead */ + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); + + page_cache_sync_readahead(file->f_mapping, &file->f_ra, file, start, end - start + 1); + + return 0; +} + +/** + * mpioc_xvm_create() - create an extended VMA map (AKA mcache map) + * @unit: + * @arg: + */ +int mpioc_xvm_create(struct mpc_unit *unit, struct mpool_descriptor *mp, struct mpioc_vma *ioc) +{ + struct mpc_mbinfo *mbinfov; + struct kmem_cache *cache; + struct mpc_rgnmap *rm; + struct mpc_xvm *xvm; + size_t largest, sz; + uint mbidc, mult; + u64 *mbidv; + int rc, i; + + if (!unit || !unit->un_mapping || !ioc) + return -EINVAL; + + if (ioc->im_mbidc < 1) + return -EINVAL; + + if (ioc->im_advice > MPC_VMA_PINNED) + return -EINVAL; + + mult = 1; + if (ioc->im_advice == MPC_VMA_WARM) + mult = 10; + else if (ioc->im_advice == MPC_VMA_HOT) + mult = 100; + + mbidc = ioc->im_mbidc; + + sz = sizeof(*xvm) + sizeof(*mbinfov) * mbidc; + if (sz > mpc_xvm_cachesz[1]) + return -EINVAL; + else if (sz > mpc_xvm_cachesz[0]) + cache = mpc_xvm_cache[1]; + else + cache = mpc_xvm_cache[0]; + + sz = mbidc * sizeof(mbidv[0]); + + mbidv = kmalloc(sz, GFP_KERNEL); + if (!mbidv) + return -ENOMEM; + + rc = copy_from_user(mbidv, ioc->im_mbidv, sz); + if (rc) { + kfree(mbidv); + return -EFAULT; + } + + xvm = kmem_cache_zalloc(cache, GFP_KERNEL); + if (!xvm) { + kfree(mbidv); + return -ENOMEM; + } + + xvm->xvm_magic = (u32)(uintptr_t)xvm; + xvm->xvm_mbinfoc = mbidc; + xvm->xvm_mpdesc = mp; + + xvm->xvm_mapping = unit->un_mapping; + xvm->xvm_rgnmap = &unit->un_rgnmap; + xvm->xvm_advice = ioc->im_advice; + kref_init(&xvm->xvm_ref); + xvm->xvm_cache = cache; + atomic_set(&xvm->xvm_opened, 0); + + atomic64_set(&xvm->xvm_nrpages, 0); + atomic_set(&xvm->xvm_rabusy, 0); + + largest = 0; + + mbinfov = xvm->xvm_mbinfov; + + for (i = 0; i < mbidc; ++i) { + struct mpc_mbinfo *mbinfo = mbinfov + i; + struct mblock_props props; + + rc = mblock_find_get(mp, mbidv[i], 1, &props, &mbinfo->mbdesc); + if (rc) { + mbidc = i; + goto errout; + } + + mbinfo->mblen = ALIGN(props.mpr_write_len, PAGE_SIZE); + mbinfo->mbmult = mult; + atomic64_set(&mbinfo->mbatime, 0); + + largest = max_t(size_t, largest, mbinfo->mblen); + } + + xvm->xvm_bktsz = roundup_pow_of_two(largest); + + if (xvm->xvm_bktsz * mbidc > (1ul << xvm_size_max)) { + rc = -E2BIG; + goto errout; + } + + rm = &unit->un_rgnmap; + + mutex_lock(&rm->rm_lock); + xvm->xvm_rgn = idr_alloc(&rm->rm_root, NULL, 1, -1, GFP_KERNEL); + if (xvm->xvm_rgn < 1) { + mutex_unlock(&rm->rm_lock); + + rc = xvm->xvm_rgn ?: -EINVAL; + goto errout; + } + + ioc->im_offset = (ulong)xvm->xvm_rgn << xvm_size_max; + ioc->im_bktsz = xvm->xvm_bktsz; + ioc->im_len = xvm->xvm_bktsz * mbidc; + ioc->im_len = ALIGN(ioc->im_len, (1ul << xvm_size_max)); + + atomic_inc(&rm->rm_rgncnt); + + idr_replace(&rm->rm_root, xvm, xvm->xvm_rgn); + mutex_unlock(&rm->rm_lock); + +errout: + if (rc) { + for (i = 0; i < mbidc; ++i) + mblock_put(mbinfov[i].mbdesc); + kmem_cache_free(cache, xvm); + } + + kfree(mbidv); + + return rc; +} + +/** + * mpioc_xvm_destroy() - destroy an extended VMA + * @unit: + * @arg: + */ +int mpioc_xvm_destroy(struct mpc_unit *unit, struct mpioc_vma *ioc) +{ + struct mpc_rgnmap *rm; + struct mpc_xvm *xvm; + u64 rgn; + + if (!unit || !ioc) + return -EINVAL; + + rgn = ioc->im_offset >> xvm_size_max; + rm = &unit->un_rgnmap; + + mutex_lock(&rm->rm_lock); + xvm = idr_find(&rm->rm_root, rgn); + if (xvm && kref_read(&xvm->xvm_ref) == 1 && !atomic_read(&xvm->xvm_opened)) + idr_remove(&rm->rm_root, rgn); + else + xvm = NULL; + mutex_unlock(&rm->rm_lock); + + if (xvm) + mpc_xvm_put(xvm); + + return 0; +} + +int mpioc_xvm_purge(struct mpc_unit *unit, struct mpioc_vma *ioc) +{ + struct mpc_xvm *xvm; + u64 rgn; + + if (!unit || !ioc) + return -EINVAL; + + rgn = ioc->im_offset >> xvm_size_max; + + xvm = mpc_xvm_lookup(&unit->un_rgnmap, rgn); + if (!xvm) + return -ENOENT; + + mpc_xvm_put(xvm); + + return 0; +} + +int mpioc_xvm_vrss(struct mpc_unit *unit, struct mpioc_vma *ioc) +{ + struct mpc_xvm *xvm; + u64 rgn; + + if (!unit || !ioc) + return -EINVAL; + + rgn = ioc->im_offset >> xvm_size_max; + + xvm = mpc_xvm_lookup(&unit->un_rgnmap, rgn); + if (!xvm) + return -ENOENT; + + ioc->im_vssp = mpc_xvm_pglen(xvm); + ioc->im_rssp = atomic64_read(&xvm->xvm_nrpages); + + mpc_xvm_put(xvm); + + return 0; +} + +int mcache_init(void) +{ + size_t sz; + int rc, i; + + xvm_max = clamp_t(uint, xvm_max, 1024, 1u << 30); + xvm_size_max = clamp_t(ulong, xvm_size_max, 27, 32); + + sz = sizeof(struct mpc_mbinfo) * 8; + mpc_xvm_cachesz[0] = sizeof(struct mpc_xvm) + sz; + + mpc_xvm_cache[0] = kmem_cache_create("mpool_xvm_0", mpc_xvm_cachesz[0], 0, + SLAB_HWCACHE_ALIGN | SLAB_POISON, NULL); + if (!mpc_xvm_cache[0]) { + rc = -ENOMEM; + mp_pr_err("mpc xvm cache 0 create failed", rc); + return rc; + } + + sz = sizeof(struct mpc_mbinfo) * 32; + mpc_xvm_cachesz[1] = sizeof(struct mpc_xvm) + sz; + + mpc_xvm_cache[1] = kmem_cache_create("mpool_xvm_1", mpc_xvm_cachesz[1], 0, + SLAB_HWCACHE_ALIGN | SLAB_POISON, NULL); + if (!mpc_xvm_cache[1]) { + rc = -ENOMEM; + mp_pr_err("mpc xvm cache 1 create failed", rc); + goto errout; + } + + mpc_wq_trunc = alloc_workqueue("mpc_wq_trunc", WQ_UNBOUND, 16); + if (!mpc_wq_trunc) { + rc = -ENOMEM; + mp_pr_err("trunc workqueue alloc failed", rc); + goto errout; + } + + for (i = 0; i < ARRAY_SIZE(mpc_wq_rav); ++i) { + int maxactive = 16; + char name[16]; + + snprintf(name, sizeof(name), "mpc_wq_ra%d", i); + + mpc_wq_rav[i] = alloc_workqueue(name, 0, maxactive); + if (!mpc_wq_rav[i]) { + rc = -ENOMEM; + mp_pr_err("mpctl ra workqueue alloc failed", rc); + goto errout; + } + } + + return 0; + +errout: + mcache_exit(); + return rc; +} + +void mcache_exit(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mpc_wq_rav); ++i) { + if (mpc_wq_rav[i]) + destroy_workqueue(mpc_wq_rav[i]); + mpc_wq_rav[i] = NULL; + } + + if (mpc_wq_trunc) + destroy_workqueue(mpc_wq_trunc); + kmem_cache_destroy(mpc_xvm_cache[1]); + kmem_cache_destroy(mpc_xvm_cache[0]); +} + +static const struct vm_operations_struct mpc_vops_default = { + .open = mpc_vm_open, + .close = mpc_vm_close, + .fault = mpc_vm_fault, +}; + +const struct address_space_operations mpc_aops_default = { + .readpages = mpc_readpages, + .releasepage = mpc_releasepage, + .invalidatepage = mpc_invalidatepage, + .migratepage = mpc_migratepage, +}; diff --git a/drivers/mpool/mcache.h b/drivers/mpool/mcache.h new file mode 100644 index 000000000000..fe6f45a05494 --- /dev/null +++ b/drivers/mpool/mcache.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2015-2020 Micron Technology, Inc. All rights reserved. + */ + +#ifndef MPOOL_MCACHE_H +#define MPOOL_MCACHE_H + +#include <linux/kref.h> + +#include "init.h" +#include "mblock.h" + +struct mpc_unit; + +/** + * struct mpc_rgnmap - extended vma (xvm) region management + * @rm_lock: protects rm_root + * @rm_root: root of the region map + * @rm_rgncnt; number of active regions + * + * Note that this is not a ref-counted object, its lifetime + * is tied to struct mpc_unit. + */ +struct mpc_rgnmap { + struct mutex rm_lock; + struct idr rm_root; + atomic_t rm_rgncnt; +} ____cacheline_aligned; + + +struct mpc_mbinfo { + struct mblock_descriptor *mbdesc; + u32 mblen; + u32 mbmult; + atomic64_t mbatime; +} __aligned(32); + +struct mpc_xvm { + size_t xvm_bktsz; + uint xvm_mbinfoc; + uint xvm_rgn; + struct kref xvm_ref; + u32 xvm_magic; + struct mpool_descriptor *xvm_mpdesc; + + atomic64_t *xvm_hcpagesp; + struct address_space *xvm_mapping; + struct mpc_rgnmap *xvm_rgnmap; + + enum mpc_vma_advice xvm_advice; + atomic_t xvm_opened; + struct kmem_cache *xvm_cache; + struct mpc_xvm *xvm_next; + + ____cacheline_aligned + atomic64_t xvm_nrpages; + atomic_t xvm_rabusy; + struct work_struct xvm_work; + + ____cacheline_aligned + struct mpc_mbinfo xvm_mbinfov[]; +}; + +extern const struct address_space_operations mpc_aops_default; + +void mpc_rgnmap_flush(struct mpc_rgnmap *rm); + +int mpc_mmap(struct file *fp, struct vm_area_struct *vma); + +int mpc_fadvise(struct file *file, loff_t offset, loff_t len, int advice); + +int mpioc_xvm_create(struct mpc_unit *unit, struct mpool_descriptor *mp, struct mpioc_vma *ioc); + +int mpioc_xvm_destroy(struct mpc_unit *unit, struct mpioc_vma *ioc); + +int mpioc_xvm_purge(struct mpc_unit *unit, struct mpioc_vma *ioc); + +int mpioc_xvm_vrss(struct mpc_unit *unit, struct mpioc_vma *ioc); + +void mpc_xvm_free(struct mpc_xvm *xvm); + +int mcache_init(void) __cold; +void mcache_exit(void) __cold; + +static inline pgoff_t mpc_xvm_pgoff(struct mpc_xvm *xvm) +{ + return ((ulong)xvm->xvm_rgn << xvm_size_max) >> PAGE_SHIFT; +} + +static inline size_t mpc_xvm_pglen(struct mpc_xvm *xvm) +{ + return (xvm->xvm_bktsz * xvm->xvm_mbinfoc) >> PAGE_SHIFT; +} + +#endif /* MPOOL_MCACHE_H */ diff --git a/drivers/mpool/mpctl.c b/drivers/mpool/mpctl.c index 3a231cf982b3..f11f522ec90c 100644 --- a/drivers/mpool/mpctl.c +++ b/drivers/mpool/mpctl.c @@ -142,6 +142,11 @@ static ssize_t mpc_label_show(struct device *dev, struct device_attribute *da, c return scnprintf(buf, PAGE_SIZE, "%s\n", dev_to_unit(dev)->un_label); } +static ssize_t mpc_vma_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", xvm_size_max); +} + static ssize_t mpc_type_show(struct device *dev, struct device_attribute *da, char *buf) { struct mpool_uuid uuid; @@ -160,6 +165,7 @@ static void mpc_mpool_params_add(struct device_attribute *dattr) MPC_ATTR_RO(dattr++, mode); MPC_ATTR_RO(dattr++, ra); MPC_ATTR_RO(dattr++, label); + MPC_ATTR_RO(dattr++, vma); MPC_ATTR_RO(dattr, type); } @@ -243,6 +249,8 @@ static void mpool_params_merge_defaults(struct mpool_params *params) if (params->mp_mode != -1) params->mp_mode &= 0777; + params->mp_vma_size_max = xvm_size_max; + params->mp_rsvd0 = 0; params->mp_rsvd1 = 0; params->mp_rsvd2 = 0; @@ -382,6 +390,10 @@ static int mpc_unit_create(const char *name, struct mpc_mpool *mpool, struct mpc kref_init(&unit->un_ref); unit->un_mpool = mpool; + mutex_init(&unit->un_rgnmap.rm_lock); + idr_init(&unit->un_rgnmap.rm_root); + atomic_set(&unit->un_rgnmap.rm_rgncnt, 0); + mutex_lock(&ss->ss_lock); minor = idr_alloc(&ss->ss_unitmap, NULL, 0, -1, GFP_KERNEL); mutex_unlock(&ss->ss_lock); @@ -421,6 +433,8 @@ static void mpc_unit_release(struct kref *refp) if (unit->un_device) device_destroy(ss->ss_class, unit->un_devno); + idr_destroy(&unit->un_rgnmap.rm_root); + kfree(unit); } @@ -633,6 +647,7 @@ static int mpc_cf_journal(struct mpc_unit *unit) cfg.mc_oid2 = unit->un_ds_oidv[1]; cfg.mc_captgt = unit->un_mdc_captgt; cfg.mc_ra_pages_max = unit->un_ra_pages_max; + cfg.mc_vma_size_max = xvm_size_max; memcpy(&cfg.mc_utype, &unit->un_utype, sizeof(cfg.mc_utype)); strlcpy(cfg.mc_label, unit->un_label, sizeof(cfg.mc_label)); @@ -742,6 +757,7 @@ static int mpioc_params_get(struct mpc_unit *unit, struct mpioc_params *get) params->mp_oidv[0] = unit->un_ds_oidv[0]; params->mp_oidv[1] = unit->un_ds_oidv[1]; params->mp_ra_pages_max = unit->un_ra_pages_max; + params->mp_vma_size_max = xvm_size_max; memcpy(&params->mp_utype, &unit->un_utype, sizeof(params->mp_utype)); strlcpy(params->mp_label, unit->un_label, sizeof(params->mp_label)); strlcpy(params->mp_name, unit->un_name, sizeof(params->mp_name)); @@ -785,6 +801,8 @@ static int mpioc_params_set(struct mpc_unit *unit, struct mpioc_params *set) params = &set->mps_params; + params->mp_vma_size_max = xvm_size_max; + mutex_lock(&ss->ss_lock); if (params->mp_uid != -1 || params->mp_gid != -1 || params->mp_mode != -1) { err = mpc_mp_chown(unit, params); @@ -919,6 +937,7 @@ static void mpioc_prop_get(struct mpc_unit *unit, struct mpioc_prop *kprop) params->mp_oidv[0] = unit->un_ds_oidv[0]; params->mp_oidv[1] = unit->un_ds_oidv[1]; params->mp_ra_pages_max = unit->un_ra_pages_max; + params->mp_vma_size_max = xvm_size_max; memcpy(&params->mp_utype, &unit->un_utype, sizeof(params->mp_utype)); strlcpy(params->mp_label, unit->un_label, sizeof(params->mp_label)); strlcpy(params->mp_name, unit->un_name, sizeof(params->mp_name)); @@ -1171,6 +1190,7 @@ static int mpioc_mp_create(struct mpc_unit *ctl, struct mpioc_mpool *mp, cfg.mc_rsvd0 = mp->mp_params.mp_rsvd0; cfg.mc_captgt = MPOOL_ROOT_LOG_CAP; cfg.mc_ra_pages_max = mp->mp_params.mp_ra_pages_max; + cfg.mc_vma_size_max = mp->mp_params.mp_vma_size_max; cfg.mc_rsvd1 = mp->mp_params.mp_rsvd1; cfg.mc_rsvd2 = mp->mp_params.mp_rsvd2; cfg.mc_rsvd3 = mp->mp_params.mp_rsvd3; @@ -1300,6 +1320,7 @@ static int mpioc_mp_activate(struct mpc_unit *ctl, struct mpioc_mpool *mp, mp->mp_params.mp_oidv[0] = cfg.mc_oid1; mp->mp_params.mp_oidv[1] = cfg.mc_oid2; mp->mp_params.mp_ra_pages_max = cfg.mc_ra_pages_max; + mp->mp_params.mp_vma_size_max = cfg.mc_vma_size_max; memcpy(&mp->mp_params.mp_utype, &cfg.mc_utype, sizeof(mp->mp_params.mp_utype)); strlcpy(mp->mp_params.mp_label, cfg.mc_label, sizeof(mp->mp_params.mp_label)); @@ -2313,6 +2334,7 @@ static int mpc_open(struct inode *ip, struct file *fp) } fp->f_op = &mpc_fops_default; + fp->f_mapping->a_ops = &mpc_aops_default; unit->un_mapping = fp->f_mapping; @@ -2361,8 +2383,11 @@ static int mpc_release(struct inode *ip, struct file *fp) if (!lastclose) goto errout; - if (mpc_unit_ismpooldev(unit)) + if (mpc_unit_ismpooldev(unit)) { + mpc_rgnmap_flush(&unit->un_rgnmap); + unit->un_mapping = NULL; + } unit->un_open_excl = false; @@ -2530,6 +2555,22 @@ static long mpc_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) rc = mpioc_mlog_erase(unit, argp); break; + case MPIOC_VMA_CREATE: + rc = mpioc_xvm_create(unit, unit->un_mpool->mp_desc, argp); + break; + + case MPIOC_VMA_DESTROY: + rc = mpioc_xvm_destroy(unit, argp); + break; + + case MPIOC_VMA_PURGE: + rc = mpioc_xvm_purge(unit, argp); + break; + + case MPIOC_VMA_VRSS: + rc = mpioc_xvm_vrss(unit, argp); + break; + default: rc = -ENOTTY; mp_pr_rl("invalid command %x: dir=%u type=%c nr=%u size=%u", @@ -2553,6 +2594,8 @@ static const struct file_operations mpc_fops_default = { .open = mpc_open, .release = mpc_release, .unlocked_ioctl = mpc_ioctl, + .mmap = mpc_mmap, + .fadvise = mpc_fadvise, }; static int mpc_exit_unit(int minor, void *item, void *arg) diff --git a/drivers/mpool/mpctl.h b/drivers/mpool/mpctl.h index 412a6a491c15..b93e44248f03 100644 --- a/drivers/mpool/mpctl.h +++ b/drivers/mpool/mpctl.h @@ -11,6 +11,8 @@ #include <linux/device.h> #include <linux/semaphore.h> +#include "mcache.h" + #define ITERCB_NEXT (0) #define ITERCB_DONE (1) @@ -23,6 +25,7 @@ struct mpc_unit { uid_t un_uid; gid_t un_gid; mode_t un_mode; + struct mpc_rgnmap un_rgnmap; dev_t un_devno; const struct mpc_uinfo *un_uinfo; struct mpc_mpool *un_mpool;

[v2,19/22] mpool: add support to mmap arbitrary collection of mblocks

Commit Message

Patch