From patchwork Tue Dec  8 15:15:06 2015
X-Patchwork-Submitter: Yishai Hadas
X-Patchwork-Id: 7798601
From: Yishai Hadas <yishaih@mellanox.com>
To: dledford@redhat.com
Cc: linux-rdma@vger.kernel.org, yishaih@mellanox.com, ogerlitz@mellanox.com,
	talal@mellanox.com
Subject: [RFC contig pages support 1/2] IB: Support contiguous memory
	operations
Date: Tue, 8 Dec 2015 17:15:06 +0200
Message-Id: <1449587707-24214-2-git-send-email-yishaih@mellanox.com>
In-Reply-To: <1449587707-24214-1-git-send-email-yishaih@mellanox.com>
References: <1449587707-24214-1-git-send-email-yishaih@mellanox.com>
X-Mailing-List: linux-rdma@vger.kernel.org

A new structure, 'cmem', represents contiguously allocated memory.
It supports allocate, free, and map-to-virtual-address operations.
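As an illustration of the intended call flow, here is a hypothetical
driver-side sketch (the function name and surrounding driver plumbing
are assumptions for illustration, not part of this patch):

/* Hypothetical example: a driver mmap handler backing a user
 * mapping with 2MB-contiguous blocks.
 */
static int example_contig_mmap(struct ib_ucontext *context,
			       struct vm_area_struct *vma)
{
	struct ib_cmem *cmem;
	int ret;

	/* 21 == log2(2MB); each block is one 2MB compound page */
	cmem = ib_cmem_alloc_contiguous_pages(context,
					      vma->vm_end - vma->vm_start,
					      21, /* numa_node */ -1);
	if (IS_ERR(cmem))
		return PTR_ERR(cmem);

	ret = ib_cmem_map_contiguous_pages_to_vma(cmem, vma);
	if (ret)
		/* Failure: drop the initial kref and free the blocks */
		ib_cmem_release_contiguous_pages(cmem);
	/* On success the VMA holds the reference; vm_close releases it */
	return ret;
}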
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
---
 drivers/infiniband/core/Makefile |   2 +-
 drivers/infiniband/core/cmem.c   | 245 +++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_cmem.h           |  41 +++++++
 3 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/cmem.c
 create mode 100644 include/rdma/ib_cmem.h

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d43a899..8549ea4 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
				device.o fmr_pool.o cache.o netlink.o \
				roce_gid_mgmt.o
-ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o

 ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
new file mode 100644
index 0000000..21d8573
--- /dev/null
+++ b/drivers/infiniband/core/cmem.c
@@ -0,0 +1,245 @@
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <rdma/ib_cmem.h>
+#include "uverbs.h"
+
+static void ib_cmem_release(struct kref *ref)
+{
+	struct ib_cmem *cmem;
+	struct ib_cmem_block *cmem_block, *tmp;
+	unsigned long ntotal_pages;
+
+	cmem = container_of(ref, struct ib_cmem, refcount);
+
+	list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
+		__free_pages(cmem_block->page, cmem->block_order);
+		list_del(&cmem_block->list);
+		kfree(cmem_block);
+	}
+	/* No locking is needed:
+	 * ib_cmem_release is called from vm_close, which is always called
+	 * with mm->mmap_sem held for writing.
+	 * The only exception is when the process is shutting down, but in
+	 * that case the counter is no longer relevant.
+	 */
+	if (current->mm) {
+		ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
+		current->mm->pinned_vm -= ntotal_pages;
+	}
+	kfree(cmem);
+}
+
+/**
+ * ib_cmem_release_contiguous_pages - release memory allocated by
+ * ib_cmem_alloc_contiguous_pages.
+ * @cmem: cmem struct to release
+ */
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
+{
+	kref_put(&cmem->refcount, ib_cmem_release);
+}
+EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
+
+static void cmem_vma_open(struct vm_area_struct *area)
+{
+	struct ib_cmem *ib_cmem;
+
+	ib_cmem = (struct ib_cmem *)(area->vm_private_data);
+
+	/* vm_open and vm_close are always called with mm->mmap_sem held for
+	 * writing. The only exception is when the process is shutting down, at
+	 * which point vm_close is called with no locks held, but since it is
+	 * after the VMAs have been detached, it is impossible that vm_open will
+	 * be called. Therefore, there is no need to synchronize the kref_get and
+	 * kref_put calls.
+	 */
+	kref_get(&ib_cmem->refcount);
+}
+
+static void cmem_vma_close(struct vm_area_struct *area)
+{
+	struct ib_cmem *cmem;
+
+	cmem = (struct ib_cmem *)(area->vm_private_data);
+
+	ib_cmem_release_contiguous_pages(cmem);
+}
+
+static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
+	.open = cmem_vma_open,
+	.close = cmem_vma_close
+};
+
+/**
+ * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
+ * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
+ * @vma: VMA to inject pages into.
+ */
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+					struct vm_area_struct *vma)
+{
+	int ret;
+	unsigned long page_entry;
+	unsigned long ntotal_pages;
+	unsigned long ncontig_pages;
+	unsigned long total_size;
+	struct page *page;
+	unsigned long vma_entry_number = 0;
+	struct ib_cmem_block *ib_cmem_block = NULL;
+
+	total_size = vma->vm_end - vma->vm_start;
+	if (ib_cmem->length != total_size)
+		return -EINVAL;
+
+	if (total_size != PAGE_ALIGN(total_size)) {
+		WARN(1,
+		     "ib_cmem_map: total size %lu not aligned to page size\n",
+		     total_size);
+		return -EINVAL;
+	}
+
+	ntotal_pages = total_size >> PAGE_SHIFT;
+	ncontig_pages = 1 << ib_cmem->block_order;
+
+	list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
+		page = ib_cmem_block->page;
+		for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
+			/* We reached the end of the VMA - exit both loops */
+			if (vma_entry_number >= ntotal_pages)
+				goto end;
+
+			ret = vm_insert_page(vma, vma->vm_start +
+				(vma_entry_number << PAGE_SHIFT), page);
+			if (ret < 0)
+				goto err_vm_insert;
+
+			vma_entry_number++;
+			page++;
+		}
+	}
+
+end:
+	/* We expect to have enough pages */
+	if (vma_entry_number >= ntotal_pages) {
+		vma->vm_ops = &cmem_contig_pages_vm_ops;
+		vma->vm_private_data = ib_cmem;
+		return 0;
+	}
+	/* Not expected, but if we reach here, not enough contiguous
+	 * pages were registered.
+	 */
+	ret = -EINVAL;
+
+err_vm_insert:
+	zap_vma_ptes(vma, vma->vm_start, total_size);
+	return ret;
+}
+EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
+
+/**
+ * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
+ * @context: userspace context to allocate memory for
+ * @total_size: total required size for that allocation.
+ * @page_size_order: order (log2, in bytes) of one contiguous block.
+ * @numa_node: NUMA node to allocate memory from;
+ *	when numa_node < 0, use the default NUMA node.
+ */
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+					       unsigned long total_size,
+					       unsigned long page_size_order,
+					       int numa_node)
+{
+	struct ib_cmem *cmem;
+	unsigned long ntotal_pages;
+	unsigned long ncontiguous_pages;
+	unsigned long ncontiguous_groups;
+	struct page *page;
+	int i;
+	int ncontiguous_pages_order;
+	struct ib_cmem_block *ib_cmem_block;
+	unsigned long locked;
+	unsigned long lock_limit;
+
+	if (page_size_order < PAGE_SHIFT || page_size_order > 31)
+		return ERR_PTR(-EINVAL);
+
+	cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
+	if (!cmem)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&cmem->refcount);
+	cmem->context = context;
+	INIT_LIST_HEAD(&cmem->ib_cmem_block);
+
+	/* Total size is expected to be already page aligned -
+	 * verifying anyway.
+	 */
+	ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
+	/* ib_cmem_alloc_contiguous_pages is called as part of mmap
+	 * with mm->mmap_sem held for writing.
+	 * No need to lock.
+	 */
+	locked = ntotal_pages + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+		goto err_alloc;
+
+	/* How many contiguous pages do we need in one block */
+	ncontiguous_pages = (1 << page_size_order) >> PAGE_SHIFT;
+	ncontiguous_pages_order = ilog2(ncontiguous_pages);
+	ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order) +
+		(!!(ntotal_pages & (ncontiguous_pages - 1)));
+
+	/* Check against MAX_ORDER to avoid a WARN from alloc_pages below */
+	if (ncontiguous_pages_order >= MAX_ORDER)
+		goto err_alloc;
+	/* We set block_order before starting the allocation to prevent
+	 * a leak in the failure flow in ib_cmem_release.
+	 * At that point cmem->length still holds 0 from kzalloc, as expected.
+	 */
+	cmem->block_order = ncontiguous_pages_order;
+	for (i = 0; i < ncontiguous_groups; i++) {
+		/* Allocate the managed entry */
+		ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
+					GFP_KERNEL);
+		if (!ib_cmem_block)
+			goto err_alloc;
+
+		if (numa_node < 0)
+			page = alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
+					   __GFP_COMP | __GFP_NOWARN,
+					   ncontiguous_pages_order);
+		else
+			page = alloc_pages_node(numa_node,
+						GFP_HIGHUSER | __GFP_ZERO |
+						__GFP_COMP | __GFP_NOWARN,
+						ncontiguous_pages_order);
+
+		if (!page) {
+			kfree(ib_cmem_block);
+			/* We must free any previously successful
+			 * allocations.
+			 */
+			goto err_alloc;
+		}
+
+		ib_cmem_block->page = page;
+		list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
+	}
+
+	cmem->length = total_size;
+	current->mm->pinned_vm = locked;
+	return cmem;
+
+err_alloc:
+	ib_cmem_release_contiguous_pages(cmem);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
new file mode 100644
index 0000000..5f26a49
--- /dev/null
+++ b/include/rdma/ib_cmem.h
@@ -0,0 +1,41 @@
+#ifndef IB_CMEM_H
+#define IB_CMEM_H
+
+#include <linux/kref.h>
+#include <rdma/ib_verbs.h>
+
+/* Contiguous memory structure */
+struct ib_cmem {
+	struct ib_ucontext *context;
+	size_t length;
+	/* Linked list of the contiguous blocks composing this cmem */
+	struct list_head ib_cmem_block;
+
+	/* Order of a cmem block: 2^block_order equals the number
+	 * of physical pages per block.
+	 */
+	unsigned long block_order;
+	/* Reference counter for this memory area;
+	 * when it reaches 0 the pages are returned to the kernel.
+	 */
+	struct kref refcount;
+};
+
+struct ib_cmem_block {
+	struct list_head list;
+	/* page points to the page struct of the head page
+	 * of the compound page backing this block.
+	 * The block order is saved once, as part of ib_cmem.
+	 */
+	struct page *page;
+};
+
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+					struct vm_area_struct *vma);
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+					       unsigned long total_size,
+					       unsigned long page_size_order,
+					       int numa_node);
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
+
+#endif
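
As a worked example of the sizing arithmetic in
ib_cmem_alloc_contiguous_pages, the following standalone userspace
sketch (illustration only; it assumes a 4KB PAGE_SIZE and is not part
of the patch) shows how a request is split into blocks:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4KB pages */

int main(void)
{
	unsigned long total_size = 6UL << 20;	/* 6MB request */
	unsigned long page_size_order = 21;	/* 2MB contiguous blocks */

	unsigned long ntotal_pages = total_size >> PAGE_SHIFT;
	unsigned long ncontig_pages =
		(1UL << page_size_order) >> PAGE_SHIFT;
	unsigned long ngroups = ntotal_pages / ncontig_pages +
		!!(ntotal_pages % ncontig_pages);

	/* Prints: 1536 pages -> 3 block(s) of 512 pages each */
	printf("%lu pages -> %lu block(s) of %lu pages each\n",
	       ntotal_pages, ngroups, ncontig_pages);
	return 0;
}

With these numbers the allocator would make three alloc_pages() calls
of order 9 (512 pages each), safely below the usual MAX_ORDER of 11.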