
[RFC contig pages support 1/2] IB: Supports contiguous memory operations

Message ID 1449587707-24214-2-git-send-email-yishaih@mellanox.com (mailing list archive)
State Not Applicable

Commit Message

Yishai Hadas Dec. 8, 2015, 3:15 p.m. UTC
A new structure, 'cmem', represents the contiguously allocated memory.
It supports allocate, free, and 'map to virtual address' operations.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
---
 drivers/infiniband/core/Makefile |   2 +-
 drivers/infiniband/core/cmem.c   | 245 +++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_cmem.h           |  41 +++++++
 3 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/cmem.c
 create mode 100644 include/rdma/ib_cmem.h

Comments

Christoph Hellwig Dec. 8, 2015, 3:18 p.m. UTC | #1
There is absolutely nothing IB specific here.  If you want to support
anonymous mmaps to allocate large contiguous pages, work with the MM
folks on providing that in a generic fashion.

Jason Gunthorpe Dec. 8, 2015, 5:15 p.m. UTC | #2
On Tue, Dec 08, 2015 at 07:18:52AM -0800, Christoph Hellwig wrote:
> There is absolutely nothing IB specific here.  If you want to support
> anonymous mmaps to allocate large contiguous pages, work with the MM
> folks on providing that in a generic fashion.

Yes please.

We already have huge page mmaps; how much win is had by going from
huge page maps to this contiguous map?

Jason
Shachar Raindel Dec. 9, 2015, 10 a.m. UTC | #3
> -----Original Message-----
> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
> Behalf Of Jason Gunthorpe
> Sent: Tuesday, December 08, 2015 7:16 PM
> To: Christoph Hellwig <hch@infradead.org>
> Cc: Yishai Hadas <yishaih@mellanox.com>; dledford@redhat.com; linux-
> rdma@vger.kernel.org; Or Gerlitz <ogerlitz@mellanox.com>; Tal Alon
> <talal@mellanox.com>; linux-mm@kvack.org
> Subject: Re: [RFC contig pages support 1/2] IB: Supports contiguous
> memory operations
> 
> On Tue, Dec 08, 2015 at 07:18:52AM -0800, Christoph Hellwig wrote:
> > There is absolutely nothing IB specific here.  If you want to support
> > anonymous mmaps to allocate large contiguous pages, work with the MM
> > folks on providing that in a generic fashion.
> 
> Yes please.
> 

Note that other HW vendors are developing similar solutions; see for example: http://www.slideshare.net/linaroorg/hkg15106-replacing-cmem-meeting-tis-soc-shared-buffer-allocation-management-and-address-translation-requirements

> We already have huge page mmaps; how much win is had by going from
> huge page maps to this contiguous map?
> 

As far as gain is concerned, we are seeing gains in two cases here:
1. If the system has lots of non-fragmented free memory, you can create large contig blocks that are above the CPU huge page size.
2. If the system memory is very fragmented, you cannot allocate huge pages. However, an API that allows you to create small (e.g. 64KB, 128KB, etc.) contig blocks reduces the load on the HW page tables and caches.
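
For a concrete sense of case 2: mapping a 1GB buffer with 4KB pages takes
262,144 HW translation entries, while 64KB contiguous blocks cut that to
16,384 - a 16x reduction.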

Thanks,
--Shachar
Jason Gunthorpe Dec. 9, 2015, 5:48 p.m. UTC | #4
On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> > Yes please.

> Note that other HW vendors are developing similar solutions; see for
> example:
> http://www.slideshare.net/linaroorg/hkg15106-replacing-cmem-meeting-tis-soc-shared-buffer-allocation-management-and-address-translation-requirements

CMA and its successors are for something totally different.

> > We already have huge page mmaps; how much win is had by going from
> > huge page maps to this contiguous map?
> 
> As far as gain is concerned, we are seeing gains in two cases here:
> 1. If the system has lots of non-fragmented free memory, you can
> create large contig blocks that are above the CPU huge page size.
> 2. If the system memory is very fragmented, you cannot allocate huge
> pages. However, an API that allows you to create small (e.g. 64KB,
> 128KB, etc.) contig blocks reduces the load on the HW page tables
> and caches.

I understand what it does; I was looking for performance numbers. The
last time I trivially benchmarked huge pages vs not huge pages on mlx4,
I wasn't able to detect a performance difference.

Jason
Christoph Hellwig Dec. 9, 2015, 6:39 p.m. UTC | #5
On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> As far as gain is concerned, we are seeing gains in two cases here:
> 1. If the system has lots of non-fragmented free memory, you can create large contig blocks that are above the CPU huge page size.
> 2. If the system memory is very fragmented, you cannot allocate huge pages. However, an API that allows you to create small (e.g. 64KB, 128KB, etc.) contig blocks reduces the load on the HW page tables and caches.

None of that is a unique requirement for the mlx4 devices.  Again,
please work with the memory management folks to address your
requirements in a generic way!
Shachar Raindel Dec. 13, 2015, 12:48 p.m. UTC | #6
> -----Original Message-----
> From: Christoph Hellwig [mailto:hch@infradead.org]
> Sent: Wednesday, December 09, 2015 8:40 PM
> 
> On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> > As far as gain is concerned, we are seeing gains in two cases here:
> > 1. If the system has lots of non-fragmented free memory, you can
> > create large contig blocks that are above the CPU huge page size.
> > 2. If the system memory is very fragmented, you cannot allocate huge
> > pages. However, an API that allows you to create small (e.g. 64KB,
> > 128KB, etc.) contig blocks reduces the load on the HW page tables and
> > caches.
> 
> None of that is a unique requirement for the mlx4 devices.  Again,
> please work with the memory management folks to address your
> requirements in a generic way!

I completely agree, and this RFC was sent in order to start discussion
on this subject.

Dear MM people, can you please advise on the subject?

Multiple HW vendors, from different fields, ranging from embedded SoC
devices (TI) to HPC (Mellanox), are looking for a solution to allocate
blocks of contiguous memory to user space applications, without using huge
pages.

What should be the API to expose such a feature?

Should we create a virtual FS that allows the user to create "files"
representing memory allocations, and define the contiguous level we
attempt to allocate using folders (similar to hugetlbfs)?

Should we patch hugetlbfs to allow allocation of contiguous memory chunks,
without creating larger memory mapping in the CPU page tables?

Should we create a special "allocator" virtual device, that will hand out
memory in contiguous chunks via a call to mmap with an FD connected to the
device?
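
As a minimal sketch of the third option (all names here are hypothetical;
the real interface would come out of the MM discussion), the user space
side could look like:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4 * 65536;	/* four 64KB contiguous blocks */
	/* hypothetical allocator device */
	int fd = open("/dev/contig_alloc", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The device's mmap handler would allocate the contiguous blocks
	 * and insert them into the VMA, much like the cmem code in this
	 * patch does. */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	/* ... hand buf to the HW / application ... */
	munmap(buf, len);
	close(fd);
	return 0;
}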

Thanks,
--Shachar


Vlastimil Babka Dec. 22, 2015, 2:59 p.m. UTC | #7
On 12/13/2015 01:48 PM, Shachar Raindel wrote:
>
>
>> -----Original Message-----
>> From: Christoph Hellwig [mailto:hch@infradead.org]
>> Sent: Wednesday, December 09, 2015 8:40 PM
>>
>> On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
>>> As far as gain is concerned, we are seeing gains in two cases here:
>>> 1. If the system has lots of non-fragmented free memory, you can
>>> create large contig blocks that are above the CPU huge page size.
>>> 2. If the system memory is very fragmented, you cannot allocate huge
>>> pages. However, an API that allows you to create small (e.g. 64KB,
>>> 128KB, etc.) contig blocks reduces the load on the HW page tables and
>>> caches.
>>
>> None of that is a unique requirement for the mlx4 devices.  Again,
>> please work with the memory management folks to address your
>> requirements in a generic way!
>
> I completely agree, and this RFC was sent in order to start discussion
> on this subject.
>
> Dear MM people, can you please advise on the subject?
>
> Multiple HW vendors, from different fields, ranging from embedded SoC
> devices (TI) to HPC (Mellanox), are looking for a solution to allocate
> blocks of contiguous memory to user space applications, without using huge
> pages.
>
> What should be the API to expose such a feature?
>
> Should we create a virtual FS that allows the user to create "files"
> representing memory allocations, and define the contiguous level we
> attempt to allocate using folders (similar to hugetlbfs)?
>
> Should we patch hugetlbfs to allow allocation of contiguous memory chunks,
> without creating larger memory mapping in the CPU page tables?
>
> Should we create a special "allocator" virtual device, that will hand out
> memory in contiguous chunks via a call to mmap with an FD connected to the
> device?

How much memory do you assume to be used like this? Is this memory
supposed to be swappable, migratable, etc.? I.e. on LRU lists?
Allocating a lot of memory (e.g. most of userspace memory) that's not
on the LRU wouldn't be nice. But LRU operations are not prepared to work
with such non-standard-sized allocations, regardless of what API you use.
So I think those are the more fundamental questions here.

> Thanks,
> --Shachar
Shachar Raindel Dec. 23, 2015, 4:30 p.m. UTC | #8
> -----Original Message-----
> From: Vlastimil Babka [mailto:vbabka@suse.cz]
> Sent: Tuesday, December 22, 2015 4:59 PM
> 
> On 12/13/2015 01:48 PM, Shachar Raindel wrote:
> >
> >> -----Original Message-----
> >> From: Christoph Hellwig [mailto:hch@infradead.org]
> >> Sent: Wednesday, December 09, 2015 8:40 PM
> >>
> >> On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> >>> As far as gain is concerned, we are seeing gains in two cases here:
> >>> 1. If the system has lots of non-fragmented free memory, you can
> >>> create large contig blocks that are above the CPU huge page size.
> >>> 2. If the system memory is very fragmented, you cannot allocate huge
> >>> pages. However, an API that allows you to create small (e.g. 64KB,
> >>> 128KB, etc.) contig blocks reduces the load on the HW page tables and
> >>> caches.
> >>
> >> None of that is a unique requirement for the mlx4 devices.  Again,
> >> please work with the memory management folks to address your
> >> requirements in a generic way!
> >
> > I completely agree, and this RFC was sent in order to start discussion
> > on this subject.
> >
> > Dear MM people, can you please advise on the subject?
> >
> > Multiple HW vendors, from different fields, ranging from embedded SoC
> > devices (TI) to HPC (Mellanox), are looking for a solution to allocate
> > blocks of contiguous memory to user space applications, without using
> > huge pages.
> >
> > What should be the API to expose such a feature?
> >
> > Should we create a virtual FS that allows the user to create "files"
> > representing memory allocations, and define the contiguous level we
> > attempt to allocate using folders (similar to hugetlbfs)?
> >
> > Should we patch hugetlbfs to allow allocation of contiguous memory
> > chunks, without creating larger memory mapping in the CPU page tables?
> >
> > Should we create a special "allocator" virtual device, that will hand
> > out memory in contiguous chunks via a call to mmap with an FD connected
> > to the device?
> 
> How much memory do you assume to be used like this?

Depends on the use case. Most likely several MBs/core, used for interfacing
with the HW (packet rings, frame buffers, etc.).

Some applications might want to perform calculations in such memory, to 
optimize communication time, especially in the HPC market.

> Is this memory
> supposed to be swappable, migratable, etc.? I.e. on LRU lists?

Most likely not. In many of the relevant applications (embedded, HPC),
there is no swap and the application threads are pinned to specific cores
and NUMA nodes.
The biggest pain here is that these memory pages will not be eligible for
compaction, making it harder to handle fragmentation and CMA allocation
requests.

> Allocating a lot of memory (e.g. most of userspace memory) that's not
> on the LRU wouldn't be nice. But LRU operations are not prepared to work
> with such non-standard-sized allocations, regardless of what API you use.
> So I think those are the more fundamental questions here.

I agree that there are fundamental questions here. 

That being said, there is a clear need for an API that allows allocating
to user space a limited amount of memory composed of large contiguous
blocks.

What will be the best way to implement such a solution?

Thanks,
--Shachar

Vlastimil Babka Jan. 4, 2016, 2:43 p.m. UTC | #9
On 12/23/2015 05:30 PM, Shachar Raindel wrote:
 >>>
 >>> I completely agree, and this RFC was sent in order to start discussion
 >>> on this subject.
 >>>
 >>> Dear MM people, can you please advise on the subject?
 >>>
 >>> Multiple HW vendors, from different fields, ranging from embedded SoC
 >>> devices (TI) to HPC (Mellanox), are looking for a solution to allocate
 >>> blocks of contiguous memory to user space applications, without using
 >>> huge pages.
 >>>
 >>> What should be the API to expose such a feature?
 >>>
 >>> Should we create a virtual FS that allows the user to create "files"
 >>> representing memory allocations, and define the contiguous level we
 >>> attempt to allocate using folders (similar to hugetlbfs)?
 >>>
 >>> Should we patch hugetlbfs to allow allocation of contiguous memory
 >>> chunks, without creating larger memory mapping in the CPU page tables?
 >>>
 >>> Should we create a special "allocator" virtual device, that will hand
 >>> out memory in contiguous chunks via a call to mmap with an FD connected
 >>> to the device?
 >>
 >> How much memory do you assume to be used like this?
 >
 > Depends on the use case. Most likely several MBs/core, used for
 > interfacing with the HW (packet rings, frame buffers, etc.).
 >
 > Some applications might want to perform calculations in such memory, to
 > optimize communication time, especially in the HPC market.

OK.

 >
 >> Is this memory
 >> supposed to be swappable, migratable, etc.? I.e. on LRU lists?
 >
 > Most likely not. In many of the relevant applications (embedded, HPC),
 > there is no swap and the application threads are pinned to specific cores
 > and NUMA nodes.
 > The biggest pain here is that these memory pages will not be eligible for
 > compaction, making it harder to handle fragmentation and CMA allocation
 > requests.

There was a patch set to enable compaction on such pages, see
https://lwn.net/Articles/650917/
Minchan was going to pick this up after Gioh left, and then it should be
possible. But it requires careful driver-specific cooperation, i.e.
knowing when a page can be isolated for migration, see
http://article.gmane.org/gmane.linux.kernel.mm/136457
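
As a rough sketch of the driver-side cooperation that patch set asks for
(the hook shapes below are illustrative only, not part of this RFC):

#include <linux/fs.h>
#include <linux/migrate.h>

/* A driver opting its non-LRU pages into compaction would provide
 * hooks along these lines: */
static bool cmem_isolate_page(struct page *page, isolate_mode_t mode)
{
	/* Allow isolation only when the HW is not using the page. */
	return false;	/* placeholder: always refuse */
}

static int cmem_migratepage(struct address_space *mapping,
			    struct page *newpage, struct page *page,
			    enum migrate_mode mode)
{
	/* Copy the data, repoint the HW translation entries to newpage,
	 * then free the old page. */
	return -EAGAIN;	/* placeholder: not implemented */
}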

 >> Allocating a lot of memory (e.g. most of userspace memory) that's not
 >> on the LRU wouldn't be nice. But LRU operations are not prepared to work
 >> with such non-standard-sized allocations, regardless of what API you use.
 >> So I think those are the more fundamental questions here.
 >
 > I agree that there are fundamental questions here.
 >
 > That being said, there is a clear need for an API that allows allocating
 > to user space a limited amount of memory composed of large contiguous
 > blocks.
 >
 > What will be the best way to implement such a solution?

Given the likely driver-specific constraints/handling of the page 
migration, I'm not sure if some completely universal API is feasible.
Maybe some reusable parts of the functionality in the patch in this 
thread could be provided by mm.

 > Thanks,
 > --Shachar

Vlastimil Babka Jan. 4, 2016, 2:44 p.m. UTC | #10
[Sorry for resending, forgot to CC Minchan]


Patch

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d43a899..8549ea4 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@  obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o
-ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
 ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
new file mode 100644
index 0000000..21d8573
--- /dev/null
+++ b/drivers/infiniband/core/cmem.c
@@ -0,0 +1,245 @@ 
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/dma-attrs.h>
+#include <linux/slab.h>
+#include <rdma/ib_cmem.h>
+#include "uverbs.h"
+
+static void ib_cmem_release(struct kref *ref)
+{
+	struct ib_cmem *cmem;
+	struct ib_cmem_block *cmem_block, *tmp;
+	unsigned long ntotal_pages;
+
+	cmem = container_of(ref, struct ib_cmem, refcount);
+
+	list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
+		__free_pages(cmem_block->page, cmem->block_order);
+		list_del(&cmem_block->list);
+		kfree(cmem_block);
+	}
+	/* No locking is needed:
+	 * ib_cmem_release is called from vm_close, which is always called
+	 * with mm->mmap_sem held for writing.
+	 * The only exception is when the process is shutting down, but in
+	 * that case the counter is no longer relevant.
+	 */
+	if (current->mm) {
+		ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
+		current->mm->pinned_vm -= ntotal_pages;
+	}
+	kfree(cmem);
+}
+
+/**
+ * ib_cmem_release_contiguous_pages - release memory allocated by
+ *                                              ib_cmem_alloc_contiguous_pages.
+ * @cmem: cmem struct to release
+ */
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
+{
+	kref_put(&cmem->refcount, ib_cmem_release);
+}
+EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
+
+static void cmem_vma_open(struct vm_area_struct *area)
+{
+	struct ib_cmem *ib_cmem;
+
+	ib_cmem = (struct ib_cmem *)(area->vm_private_data);
+
+	/* vm_open and vm_close are always called with mm->mmap_sem held for
+	 * writing. The only exception is when the process is shutting down, at
+	 * which point vm_close is called with no locks held, but since it is
+	 * after the VMAs have been detached, it is impossible that vm_open will
+	 * be called. Therefore, there is no need to synchronize the kref_get and
+	 * kref_put calls.
+	 */
+	kref_get(&ib_cmem->refcount);
+}
+
+static void cmem_vma_close(struct vm_area_struct *area)
+{
+	struct ib_cmem *cmem;
+
+	cmem = (struct ib_cmem *)(area->vm_private_data);
+
+	ib_cmem_release_contiguous_pages(cmem);
+}
+
+static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
+	.open = cmem_vma_open,
+	.close = cmem_vma_close
+};
+
+/**
+ * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
+ * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
+ * @vma: VMA to inject pages into.
+ */
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+					struct vm_area_struct *vma)
+{
+	int ret;
+	unsigned long page_entry;
+	unsigned long ntotal_pages;
+	unsigned long ncontig_pages;
+	unsigned long total_size;
+	struct page *page;
+	unsigned long vma_entry_number = 0;
+	struct ib_cmem_block *ib_cmem_block = NULL;
+
+	total_size = vma->vm_end - vma->vm_start;
+	if (ib_cmem->length != total_size)
+		return -EINVAL;
+
+	if (total_size != PAGE_ALIGN(total_size)) {
+		WARN(1,
+		     "ib_cmem_map: total size %lu not aligned to page size\n",
+		     total_size);
+		return -EINVAL;
+	}
+
+	ntotal_pages = total_size >> PAGE_SHIFT;
+	ncontig_pages = 1 << ib_cmem->block_order;
+
+	list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
+		page = ib_cmem_block->page;
+		for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
+			/* We reached the end of the vma - exit both loops */
+			if (vma_entry_number >= ntotal_pages)
+				goto end;
+
+			ret = vm_insert_page(vma, vma->vm_start +
+				(vma_entry_number << PAGE_SHIFT), page);
+			if (ret < 0)
+				goto err_vm_insert;
+
+			vma_entry_number++;
+			page++;
+		}
+	}
+
+end:
+
+	/* We expect to have enough pages */
+	if (vma_entry_number >= ntotal_pages) {
+		vma->vm_ops = &cmem_contig_pages_vm_ops;
+		vma->vm_private_data = ib_cmem;
+		return 0;
+	}
+	/* Not expected, but if we reached here,
+	 * not enough contiguous pages were registered.
+	 */
+	ret = -EINVAL;
+
+err_vm_insert:
+
+	zap_vma_ptes(vma, vma->vm_start, total_size);
+	return ret;
+}
+EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
+
+/**
+ * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
+ * @context: userspace context to allocate memory for
+ * @total_size: total required size for that allocation.
+ * @page_size_order: order of one contiguous page.
+ * @numa_node: NUMA node to allocate memory from;
+ *             when numa_node < 0, use the default node.
+ */
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+					       unsigned long total_size,
+					       unsigned long page_size_order,
+					       int numa_node)
+{
+	struct ib_cmem *cmem;
+	unsigned long ntotal_pages;
+	unsigned long ncontiguous_pages;
+	unsigned long ncontiguous_groups;
+	struct page *page;
+	int i;
+	int ncontiguous_pages_order;
+	struct ib_cmem_block *ib_cmem_block;
+	unsigned long locked;
+	unsigned long lock_limit;
+
+	if (page_size_order < PAGE_SHIFT || page_size_order > 31)
+		return ERR_PTR(-EINVAL);
+
+	cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
+	if (!cmem)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&cmem->refcount);
+	cmem->context   = context;
+	INIT_LIST_HEAD(&cmem->ib_cmem_block);
+
+	/* Total size is expected to be already page aligned -
+	 * verifying anyway.
+	 */
+	ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
+	/* ib_cmem_alloc_contiguous_pages is called as part of mmap
+	 * with mm->mmap_sem held for writing.
+	 * No need to lock.
+	 */
+	locked     = ntotal_pages + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+		goto err_alloc;
+
+	/* How many contiguous pages do we need in 1 block */
+	ncontiguous_pages = (1UL << page_size_order) >> PAGE_SHIFT;
+	ncontiguous_pages_order = ilog2(ncontiguous_pages);
+	ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order) +
+		(!!(ntotal_pages & (ncontiguous_pages - 1)));
+
+	/* Checking MAX_ORDER to prevent WARN via calling alloc_pages below */
+	if (ncontiguous_pages_order >= MAX_ORDER)
+		goto err_alloc;
+	/* We set block_order before starting the allocation to prevent
+	 * a leak in the failure flow in ib_cmem_release.
+	 * At that point cmem->length is still 0 from kzalloc, as expected.
+	 */
+	cmem->block_order = ncontiguous_pages_order;
+	for (i = 0; i < ncontiguous_groups; i++) {
+		/* Allocating the managed entry */
+		ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
+					GFP_KERNEL);
+		if (!ib_cmem_block)
+			goto err_alloc;
+
+		if (numa_node < 0)
+			page = alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
+					   __GFP_COMP | __GFP_NOWARN,
+					   ncontiguous_pages_order);
+		else
+			page = alloc_pages_node(numa_node,
+						GFP_HIGHUSER | __GFP_ZERO |
+						__GFP_COMP | __GFP_NOWARN,
+						ncontiguous_pages_order);
+
+		if (!page) {
+			kfree(ib_cmem_block);
+			/* Deallocate any previously successful
+			 * allocations, if they exist.
+			 */
+			goto err_alloc;
+		}
+
+		ib_cmem_block->page = page;
+		list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
+	}
+
+	cmem->length = total_size;
+	current->mm->pinned_vm = locked;
+	return cmem;
+
+err_alloc:
+	ib_cmem_release_contiguous_pages(cmem);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
new file mode 100644
index 0000000..5f26a49
--- /dev/null
+++ b/include/rdma/ib_cmem.h
@@ -0,0 +1,41 @@ 
+#ifndef IB_CMEM_H
+#define IB_CMEM_H
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_verbs.h>
+
+/* contiguous memory structure */
+struct ib_cmem {
+	struct ib_ucontext     *context;
+	size_t			length;
+	/* Linked list of the contiguous blocks that make up this cmem */
+	struct list_head ib_cmem_block;
+
+	/* Order of a cmem block; 2^block_order equals the number
+	 * of physical pages per block.
+	 */
+	unsigned long    block_order;
+	/* Reference counter for this memory area.
+	 * When it reaches 0, the pages are returned to the kernel.
+	 */
+	struct kref refcount;
+};
+
+struct ib_cmem_block {
+	struct list_head	list;
+	/* page points to the page struct of the head page
+	 * of the compound page.
+	 * The block order is saved once as part of ib_cmem.
+	 */
+	struct page            *page;
+};
+
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+					struct vm_area_struct *vma);
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+					       unsigned long total_size,
+					       unsigned long page_size_order,
+					       int numa_node);
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
+
+#endif
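
For reference, a minimal sketch of how a driver's ->mmap path might consume
the API above (the handler name and block-size choice are illustrative, not
part of the patch):

#include <linux/err.h>
#include <rdma/ib_cmem.h>

static int example_mmap_contig(struct ib_ucontext *context,
			       struct vm_area_struct *vma)
{
	struct ib_cmem *cmem;
	int ret;

	/* Ask for 64KB contiguous blocks (order = PAGE_SHIFT + 4 on
	 * 4KB-page systems) on the default NUMA node (-1). */
	cmem = ib_cmem_alloc_contiguous_pages(context,
					      vma->vm_end - vma->vm_start,
					      PAGE_SHIFT + 4, -1);
	if (IS_ERR(cmem))
		return PTR_ERR(cmem);

	ret = ib_cmem_map_contiguous_pages_to_vma(cmem, vma);
	if (ret)
		ib_cmem_release_contiguous_pages(cmem);
	/* On success the VMA's vm_ops hold the reference; it is dropped
	 * in cmem_vma_close when the mapping goes away. */
	return ret;
}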