diff mbox

[v2,rdma-next,17/29] IB/rxe: Memory Region (MR) manioulation

Message ID 1466084751-29901-18-git-send-email-monis@mellanox.com (mailing list archive)
State Accepted
Headers show

Commit Message

Moni Shoua June 16, 2016, 1:45 p.m. UTC
Infrastructure for memory region handling in Soft RoCE.

Signed-off-by: Kamal Heib <kamalh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: Moni Shoua <monis@mellanox.com>
---
 drivers/infiniband/hw/rxe/rxe_mr.c | 679 +++++++++++++++++++++++++++++++++++++
 1 file changed, 679 insertions(+)
 create mode 100644 drivers/infiniband/hw/rxe/rxe_mr.c
diff mbox

Patch

diff --git a/drivers/infiniband/hw/rxe/rxe_mr.c b/drivers/infiniband/hw/rxe/rxe_mr.c
new file mode 100644
index 0000000..e533249
--- /dev/null
+++ b/drivers/infiniband/hw/rxe/rxe_mr.c
@@ -0,0 +1,679 @@ 
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/*
+ * lfsr (linear feedback shift register) with period 255
+ */
+static u8 rxe_get_key(void)
+{
+	static unsigned key = 1;
+
+	key = key << 1;
+
+	key |= (0 != (key & 0x100)) ^ (0 != (key & 0x10))
+		^ (0 != (key & 0x80)) ^ (0 != (key & 0x40));
+
+	key &= 0xff;
+
+	return key;
+}
+
+int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length)
+{
+	switch (mem->type) {
+	case RXE_MEM_TYPE_DMA:
+		return 0;
+
+	case RXE_MEM_TYPE_MR:
+	case RXE_MEM_TYPE_FMR:
+		return ((iova < mem->iova) ||
+			((iova + length) > (mem->iova + mem->length))) ?
+			-EFAULT : 0;
+
+	default:
+		return -EFAULT;
+	}
+}
+
+#define IB_ACCESS_REMOTE	(IB_ACCESS_REMOTE_READ		\
+				| IB_ACCESS_REMOTE_WRITE	\
+				| IB_ACCESS_REMOTE_ATOMIC)
+
+static void rxe_mem_init(int access, struct rxe_mem *mem)
+{
+	u32 lkey = mem->pelem.index << 8 | rxe_get_key();
+	u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;
+
+	if (mem->pelem.pool->type == RXE_TYPE_MR) {
+		mem->ibmr.lkey		= lkey;
+		mem->ibmr.rkey		= rkey;
+	} else {
+		mem->ibfmr.lkey		= lkey;
+		mem->ibfmr.rkey		= rkey;
+	}
+
+	mem->lkey		= lkey;
+	mem->rkey		= rkey;
+	mem->state		= RXE_MEM_STATE_INVALID;
+	mem->type		= RXE_MEM_TYPE_NONE;
+	mem->map_shift		= ilog2(RXE_BUF_PER_MAP);
+}
+
+void rxe_mem_cleanup(void *arg)
+{
+	struct rxe_mem *mem = arg;
+	int i;
+
+	if (mem->umem)
+		ib_umem_release(mem->umem);
+
+	if (mem->map) {
+		for (i = 0; i < mem->num_map; i++)
+			kfree(mem->map[i]);
+
+		kfree(mem->map);
+	}
+}
+
+static int rxe_mem_alloc(struct rxe_dev *rxe, struct rxe_mem *mem, int num_buf)
+{
+	int i;
+	int num_map;
+	struct rxe_map **map = mem->map;
+
+	num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
+
+	mem->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
+	if (!mem->map)
+		goto err1;
+
+	for (i = 0; i < num_map; i++) {
+		mem->map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
+		if (!mem->map[i])
+			goto err2;
+	}
+
+	WARN_ON(!is_power_of_2(RXE_BUF_PER_MAP));
+
+	mem->map_shift	= ilog2(RXE_BUF_PER_MAP);
+	mem->map_mask	= RXE_BUF_PER_MAP - 1;
+
+	mem->num_buf = num_buf;
+	mem->num_map = num_map;
+	mem->max_buf = num_map * RXE_BUF_PER_MAP;
+
+	return 0;
+
+err2:
+	for (i--; i >= 0; i--)
+		kfree(mem->map[i]);
+
+	kfree(mem->map);
+err1:
+	return -ENOMEM;
+}
+
+int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd,
+		     int access, struct rxe_mem *mem)
+{
+	rxe_mem_init(access, mem);
+
+	mem->pd			= pd;
+	mem->access		= access;
+	mem->state		= RXE_MEM_STATE_VALID;
+	mem->type		= RXE_MEM_TYPE_DMA;
+
+	return 0;
+}
+
+int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
+		      u64 length, u64 iova, int access, struct ib_udata *udata,
+		      struct rxe_mem *mem)
+{
+	int			entry;
+	struct rxe_map		**map;
+	struct rxe_phys_buf	*buf = NULL;
+	struct ib_umem		*umem;
+	struct scatterlist	*sg;
+	int			num_buf;
+	void			*vaddr;
+	int err;
+
+	umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0);
+	if (IS_ERR(umem)) {
+		pr_warn("err %d from rxe_umem_get\n",
+			(int)PTR_ERR(umem));
+		err = -EINVAL;
+		goto err1;
+	}
+
+	mem->umem = umem;
+	num_buf = umem->nmap;
+
+	rxe_mem_init(access, mem);
+
+	err = rxe_mem_alloc(rxe, mem, num_buf);
+	if (err) {
+		pr_warn("err %d from rxe_mem_alloc\n", err);
+		ib_umem_release(umem);
+		goto err1;
+	}
+
+	WARN_ON(!is_power_of_2(umem->page_size));
+
+	mem->page_shift		= ilog2(umem->page_size);
+	mem->page_mask		= umem->page_size - 1;
+
+	num_buf			= 0;
+	map			= mem->map;
+	if (length > 0) {
+		buf = map[0]->buf;
+
+		for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+			vaddr = page_address(sg_page(sg));
+			if (!vaddr) {
+				pr_warn("null vaddr\n");
+				err = -ENOMEM;
+				goto err1;
+			}
+
+			buf->addr = (uintptr_t)vaddr;
+			buf->size = umem->page_size;
+			num_buf++;
+			buf++;
+
+			if (num_buf >= RXE_BUF_PER_MAP) {
+				map++;
+				buf = map[0]->buf;
+				num_buf = 0;
+			}
+		}
+	}
+
+	mem->pd			= pd;
+	mem->umem		= umem;
+	mem->access		= access;
+	mem->length		= length;
+	mem->iova		= iova;
+	mem->va			= start;
+	mem->offset		= ib_umem_offset(umem);
+	mem->state		= RXE_MEM_STATE_VALID;
+	mem->type		= RXE_MEM_TYPE_MR;
+
+	return 0;
+
+err1:
+	return err;
+}
+
+int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd,
+		      int max_pages, struct rxe_mem *mem)
+{
+	int err;
+
+	rxe_mem_init(0, mem);
+
+	err = rxe_mem_alloc(rxe, mem, max_pages);
+	if (err)
+		goto err1;
+
+	mem->pd			= pd;
+	mem->max_buf		= max_pages;
+	mem->state		= RXE_MEM_STATE_FREE;
+	mem->type		= RXE_MEM_TYPE_MR;
+
+	return 0;
+
+err1:
+	return err;
+}
+
+int rxe_mem_init_fmr(struct rxe_dev *rxe, struct rxe_pd *pd, int access,
+		     struct ib_fmr_attr *attr, struct rxe_mem *mem)
+{
+	int err;
+
+	if (attr->max_maps > rxe->attr.max_map_per_fmr) {
+		pr_warn("max_mmaps = %d too big, max_map_per_fmr = %d\n",
+			attr->max_maps, rxe->attr.max_map_per_fmr);
+		err = -EINVAL;
+		goto err1;
+	}
+
+	rxe_mem_init(access, mem);
+
+	err = rxe_mem_alloc(rxe, mem, attr->max_pages);
+	if (err)
+		goto err1;
+
+	mem->pd			= pd;
+	mem->access		= access;
+	mem->page_shift		 = attr->page_shift;
+	mem->page_mask		= (1 << attr->page_shift) - 1;
+	mem->max_buf		= attr->max_pages;
+	mem->state		= RXE_MEM_STATE_FREE;
+	mem->type		= RXE_MEM_TYPE_FMR;
+
+	return 0;
+
+err1:
+	return err;
+}
+
+static void lookup_iova(
+	struct rxe_mem	*mem,
+	u64			iova,
+	int			*m_out,
+	int			*n_out,
+	size_t			*offset_out)
+{
+	size_t			offset = iova - mem->iova + mem->offset;
+	int			map_index;
+	int			buf_index;
+	u64			length;
+
+	if (likely(mem->page_shift)) {
+		*offset_out = offset & mem->page_mask;
+		offset >>= mem->page_shift;
+		*n_out = offset & mem->map_mask;
+		*m_out = offset >> mem->map_shift;
+	} else {
+		map_index = 0;
+		buf_index = 0;
+
+		length = mem->map[map_index]->buf[buf_index].size;
+
+		while (offset >= length) {
+			offset -= length;
+			buf_index++;
+
+			if (buf_index == RXE_BUF_PER_MAP) {
+				map_index++;
+				buf_index = 0;
+			}
+			length = mem->map[map_index]->buf[buf_index].size;
+		}
+
+		*m_out = map_index;
+		*n_out = buf_index;
+		*offset_out = offset;
+	}
+}
+
+void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length)
+{
+	size_t offset;
+	int m, n;
+	void *addr;
+
+	if (mem->state != RXE_MEM_STATE_VALID) {
+		pr_warn("mem not in valid state\n");
+		addr = NULL;
+		goto out;
+	}
+
+	if (!mem->map) {
+		addr = (void *)(uintptr_t)iova;
+		goto out;
+	}
+
+	if (mem_check_range(mem, iova, length)) {
+		pr_warn("range violation\n");
+		addr = NULL;
+		goto out;
+	}
+
+	lookup_iova(mem, iova, &m, &n, &offset);
+
+	if (offset + length > mem->map[m]->buf[n].size) {
+		pr_warn("crosses page boundary\n");
+		addr = NULL;
+		goto out;
+	}
+
+	addr = (void *)(uintptr_t)mem->map[m]->buf[n].addr + offset;
+
+out:
+	return addr;
+}
+
+/* copy data from a range (vaddr, vaddr+length-1) to or from
+ * a mem object starting at iova. Compute incremental value of
+ * crc32 if crcp is not zero. caller must hold a reference to mem
+ */
+int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length,
+		 enum copy_direction dir, u32 *crcp)
+{
+	int			err;
+	int			bytes;
+	u8			*va;
+	struct rxe_map		**map;
+	struct rxe_phys_buf	*buf;
+	int			m;
+	int			i;
+	size_t			offset;
+	u32			crc = crcp ? (*crcp) : 0;
+
+	if (mem->type == RXE_MEM_TYPE_DMA) {
+		u8 *src, *dest;
+
+		src  = (dir == to_mem_obj) ?
+			addr : ((void *)(uintptr_t)iova);
+
+		dest = (dir == to_mem_obj) ?
+			((void *)(uintptr_t)iova) : addr;
+
+		if (crcp)
+			*crcp = crc32_le(*crcp, src, length);
+
+		memcpy(dest, src, length);
+
+		return 0;
+	}
+
+	WARN_ON(!mem->map);
+
+	err = mem_check_range(mem, iova, length);
+	if (err) {
+		err = -EFAULT;
+		goto err1;
+	}
+
+	lookup_iova(mem, iova, &m, &i, &offset);
+
+	map	= mem->map + m;
+	buf	= map[0]->buf + i;
+
+	while (length > 0) {
+		u8 *src, *dest;
+
+		va	= (u8 *)(uintptr_t)buf->addr + offset;
+		src  = (dir == to_mem_obj) ? addr : va;
+		dest = (dir == to_mem_obj) ? va : addr;
+
+		bytes	= buf->size - offset;
+
+		if (bytes > length)
+			bytes = length;
+
+		if (crcp)
+			crc = crc32_le(crc, src, bytes);
+
+		memcpy(dest, src, bytes);
+
+		length	-= bytes;
+		addr	+= bytes;
+
+		offset	= 0;
+		buf++;
+		i++;
+
+		if (i == RXE_BUF_PER_MAP) {
+			i = 0;
+			map++;
+			buf = map[0]->buf;
+		}
+	}
+
+	if (crcp)
+		*crcp = crc;
+
+	return 0;
+
+err1:
+	return err;
+}
+
+/* copy data in or out of a wqe, i.e. sg list
+ * under the control of a dma descriptor
+ */
+int copy_data(
+	struct rxe_dev		*rxe,
+	struct rxe_pd		*pd,
+	int			access,
+	struct rxe_dma_info	*dma,
+	void			*addr,
+	int			length,
+	enum copy_direction	dir,
+	u32			*crcp)
+{
+	int			bytes;
+	struct rxe_sge		*sge	= &dma->sge[dma->cur_sge];
+	int			offset	= dma->sge_offset;
+	int			resid	= dma->resid;
+	struct rxe_mem		*mem	= NULL;
+	u64			iova;
+	int			err;
+
+	if (length == 0)
+		return 0;
+
+	if (length > resid) {
+		err = -EINVAL;
+		goto err2;
+	}
+
+	if (sge->length && (offset < sge->length)) {
+		mem = lookup_mem(pd, access, sge->lkey, lookup_local);
+		if (!mem) {
+			err = -EINVAL;
+			goto err1;
+		}
+	}
+
+	while (length > 0) {
+		bytes = length;
+
+		if (offset >= sge->length) {
+			if (mem) {
+				rxe_drop_ref(mem);
+				mem = NULL;
+			}
+			sge++;
+			dma->cur_sge++;
+			offset = 0;
+
+			if (dma->cur_sge >= dma->num_sge) {
+				err = -ENOSPC;
+				goto err2;
+			}
+
+			if (sge->length) {
+				mem = lookup_mem(pd, access, sge->lkey,
+						 lookup_local);
+				if (!mem) {
+					err = -EINVAL;
+					goto err1;
+				}
+			} else {
+				continue;
+			}
+		}
+
+		if (bytes > sge->length - offset)
+			bytes = sge->length - offset;
+
+		if (bytes > 0) {
+			iova = sge->addr + offset;
+
+			err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp);
+			if (err)
+				goto err2;
+
+			offset	+= bytes;
+			resid	-= bytes;
+			length	-= bytes;
+			addr	+= bytes;
+		}
+	}
+
+	dma->sge_offset = offset;
+	dma->resid	= resid;
+
+	if (mem)
+		rxe_drop_ref(mem);
+
+	return 0;
+
+err2:
+	if (mem)
+		rxe_drop_ref(mem);
+err1:
+	return err;
+}
+
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
+{
+	struct rxe_sge		*sge	= &dma->sge[dma->cur_sge];
+	int			offset	= dma->sge_offset;
+	int			resid	= dma->resid;
+
+	while (length) {
+		unsigned int bytes;
+
+		if (offset >= sge->length) {
+			sge++;
+			dma->cur_sge++;
+			offset = 0;
+			if (dma->cur_sge >= dma->num_sge)
+				return -ENOSPC;
+		}
+
+		bytes = length;
+
+		if (bytes > sge->length - offset)
+			bytes = sge->length - offset;
+
+		offset	+= bytes;
+		resid	-= bytes;
+		length	-= bytes;
+	}
+
+	dma->sge_offset = offset;
+	dma->resid	= resid;
+
+	return 0;
+}
+
+/* (1) find the mem (mr, fmr or mw) corresponding to lkey/rkey
+ *     depending on lookup_type
+ * (2) verify that the (qp) pd matches the mem pd
+ * (3) verify that the mem can support the requested access
+ * (4) verify that mem state is valid
+ */
+struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
+			   enum lookup_type type)
+{
+	struct rxe_mem *mem;
+	struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
+	int index = key >> 8;
+
+	if (index >= RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) {
+		mem = rxe_pool_get_index(&rxe->mr_pool, index);
+		if (!mem)
+			goto err1;
+	} else if (index >= RXE_MIN_FMR_INDEX && index <= RXE_MAX_FMR_INDEX) {
+		mem = rxe_pool_get_index(&rxe->fmr_pool, index);
+		if (!mem)
+			goto err1;
+	} else {
+		goto err1;
+	}
+
+	if ((type == lookup_local && mem->lkey != key) ||
+	    (type == lookup_remote && mem->rkey != key))
+		goto err2;
+
+	if (mem->pd != pd)
+		goto err2;
+
+	if (access && !(access & mem->access))
+		goto err2;
+
+	if (mem->state != RXE_MEM_STATE_VALID)
+		goto err2;
+
+	return mem;
+
+err2:
+	rxe_drop_ref(mem);
+err1:
+	return NULL;
+}
+
+int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
+		      u64 *page, int num_pages, u64 iova)
+{
+	int i;
+	int num_buf;
+	int err;
+	struct rxe_map **map;
+	struct rxe_phys_buf *buf;
+	int page_size;
+
+	if (num_pages > mem->max_buf) {
+		err = -EINVAL;
+		goto err1;
+	}
+
+	num_buf		= 0;
+	page_size	= 1 << mem->page_shift;
+	map		= mem->map;
+	buf		= map[0]->buf;
+
+	for (i = 0; i < num_pages; i++) {
+		buf->addr = *page++;
+		buf->size = page_size;
+		buf++;
+		num_buf++;
+
+		if (num_buf == RXE_BUF_PER_MAP) {
+			map++;
+			buf = map[0]->buf;
+			num_buf = 0;
+		}
+	}
+
+	mem->iova	= iova;
+	mem->va		= iova;
+	mem->length	= num_pages << mem->page_shift;
+	mem->state	= RXE_MEM_STATE_VALID;
+
+	return 0;
+
+err1:
+	return err;
+}