diff mbox series

[rdma-core,v3,4/6] pyverbs: Add dma-buf based MR support

Message ID 1606510543-45567-5-git-send-email-jianxin.xiong@intel.com (mailing list archive)
State New, archived
Headers show
Series Add user space dma-buf support | expand

Commit Message

Xiong, Jianxin Nov. 27, 2020, 8:55 p.m. UTC
Define a new sub-class of 'MR' that uses dma-buf object for the memory
region. Define a new class 'DmaBuf' as a wrapper for dma-buf allocation
mechanism implemented in C.

Add a method to buildlib for building modules with mixed Cython and C
source.

Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
---
 buildlib/pyverbs_functions.cmake |  52 +++++++
 pyverbs/CMakeLists.txt           |   7 +
 pyverbs/dmabuf.pxd               |  15 ++
 pyverbs/dmabuf.pyx               |  72 ++++++++++
 pyverbs/dmabuf_alloc.c           | 296 +++++++++++++++++++++++++++++++++++++++
 pyverbs/dmabuf_alloc.h           |  19 +++
 pyverbs/libibverbs.pxd           |   2 +
 pyverbs/mr.pxd                   |   6 +
 pyverbs/mr.pyx                   | 103 +++++++++++++-
 9 files changed, 570 insertions(+), 2 deletions(-)
 create mode 100644 pyverbs/dmabuf.pxd
 create mode 100644 pyverbs/dmabuf.pyx
 create mode 100644 pyverbs/dmabuf_alloc.c
 create mode 100644 pyverbs/dmabuf_alloc.h

Comments

Daniel Vetter Nov. 30, 2020, 2:57 p.m. UTC | #1
On Fri, Nov 27, 2020 at 12:55:41PM -0800, Jianxin Xiong wrote:
> Define a new sub-class of 'MR' that uses dma-buf object for the memory
> region. Define a new class 'DmaBuf' as a wrapper for dma-buf allocation
> mechanism implemented in C.
> 
> Add a method to buildlib for building modules with mixed Cython and C
> source.
> 
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> ---
>  buildlib/pyverbs_functions.cmake |  52 +++++++
>  pyverbs/CMakeLists.txt           |   7 +
>  pyverbs/dmabuf.pxd               |  15 ++
>  pyverbs/dmabuf.pyx               |  72 ++++++++++
>  pyverbs/dmabuf_alloc.c           | 296 +++++++++++++++++++++++++++++++++++++++
>  pyverbs/dmabuf_alloc.h           |  19 +++
>  pyverbs/libibverbs.pxd           |   2 +
>  pyverbs/mr.pxd                   |   6 +
>  pyverbs/mr.pyx                   | 103 +++++++++++++-
>  9 files changed, 570 insertions(+), 2 deletions(-)
>  create mode 100644 pyverbs/dmabuf.pxd
>  create mode 100644 pyverbs/dmabuf.pyx
>  create mode 100644 pyverbs/dmabuf_alloc.c
>  create mode 100644 pyverbs/dmabuf_alloc.h
> 
> diff --git a/buildlib/pyverbs_functions.cmake b/buildlib/pyverbs_functions.cmake
> index 953cec2..2f6788e 100644
> --- a/buildlib/pyverbs_functions.cmake
> +++ b/buildlib/pyverbs_functions.cmake
> @@ -1,5 +1,6 @@
>  # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
>  # Copyright (c) 2018, Mellanox Technologies. All rights reserved.  See COPYING file
> +# Copyright (c) 2020, Intel Corporation. All rights reserved.  See COPYING file
>  
>  function(rdma_cython_module PY_MODULE LINKER_FLAGS)
>    foreach(PYX_FILE ${ARGN})
> @@ -32,6 +33,57 @@ function(rdma_cython_module PY_MODULE LINKER_FLAGS)
>    endforeach()
>  endfunction()
>  
> +function(rdma_multifile_module PY_MODULE MODULE_NAME LINKER_FLAGS)
> +  set(ALL_CFILES "")
> +  foreach(SRC_FILE ${ARGN})
> +    get_filename_component(FILENAME ${SRC_FILE} NAME_WE)
> +    get_filename_component(DIR ${SRC_FILE} DIRECTORY)
> +    get_filename_component(EXT ${SRC_FILE} EXT)
> +    if (DIR)
> +      set(SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${DIR}")
> +    else()
> +      set(SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
> +    endif()
> +    if (${EXT} STREQUAL ".pyx")
> +      set(PYX "${SRC_PATH}/${FILENAME}.pyx")
> +      set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c")
> +      include_directories(${PYTHON_INCLUDE_DIRS})
> +      add_custom_command(
> +        OUTPUT "${CFILE}"
> +        MAIN_DEPENDENCY "${PYX}"
> +        COMMAND ${CYTHON_EXECUTABLE} "${PYX}" -o "${CFILE}"
> +        "-I${PYTHON_INCLUDE_DIRS}"
> +        COMMENT "Cythonizing ${PYX}"
> +      )
> +      set(ALL_CFILES "${ALL_CFILES};${CFILE}")
> +    elseif(${EXT} STREQUAL ".c")
> +      set(CFILE_ORIG "${SRC_PATH}/${FILENAME}.c")
> +      set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c")
> +      if (NOT ${CFILE_ORIG} STREQUAL ${CFILE})
> +        rdma_create_symlink("${CFILE_ORIG}" "${CFILE}")
> +      endif()
> +      set(ALL_CFILES "${ALL_CFILES};${CFILE}")
> +    elseif(${EXT} STREQUAL ".h")
> +      set(HFILE_ORIG "${SRC_PATH}/${FILENAME}.h")
> +      set(HFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.h")
> +      if (NOT ${HFILE_ORIG} STREQUAL ${HFILE})
> +        rdma_create_symlink("${HFILE_ORIG}" "${HFILE}")
> +      endif()
> +    else()
> +      continue()
> +    endif()
> +  endforeach()
> +  string(REGEX REPLACE "\\.so$" "" SONAME "${MODULE_NAME}${CMAKE_PYTHON_SO_SUFFIX}")
> +  add_library(${SONAME} SHARED ${ALL_CFILES})
> +  set_target_properties(${SONAME} PROPERTIES
> +    COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -fno-strict-aliasing -Wno-unused-function -Wno-redundant-decls -Wno-shadow -Wno-cast-function-type -Wno-implicit-fallthrough -Wno-unknown-warning -Wno-unknown-warning-option -Wno-deprecated-declarations ${NO_VAR_TRACKING_FLAGS}"
> +    LIBRARY_OUTPUT_DIRECTORY "${BUILD_PYTHON}/${PY_MODULE}"
> +    PREFIX "")
> +  target_link_libraries(${SONAME} LINK_PRIVATE ${PYTHON_LIBRARIES} ibverbs rdmacm ${LINKER_FLAGS})
> +  install(TARGETS ${SONAME}
> +    DESTINATION ${CMAKE_INSTALL_PYTHON_ARCH_LIB}/${PY_MODULE})
> +endfunction()
> +
>  function(rdma_python_module PY_MODULE)
>    foreach(PY_FILE ${ARGN})
>      get_filename_component(LINK "${CMAKE_CURRENT_SOURCE_DIR}/${PY_FILE}" ABSOLUTE)
> diff --git a/pyverbs/CMakeLists.txt b/pyverbs/CMakeLists.txt
> index 9542c4b..1b21e7b 100644
> --- a/pyverbs/CMakeLists.txt
> +++ b/pyverbs/CMakeLists.txt
> @@ -1,5 +1,6 @@
>  # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
>  # Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
> +# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
>  
>  rdma_cython_module(pyverbs ""
>    addr.pyx
> @@ -24,6 +25,12 @@ rdma_python_module(pyverbs
>    utils.py
>    )
>  
> +rdma_multifile_module(pyverbs dmabuf ""
> +  dmabuf.pyx
> +  dmabuf_alloc.c
> +  dmabuf_alloc.h
> +  )
> +
>  # mlx5 and efa providers are not built without coherent DMA, e.g. ARM32 build.
>  if (HAVE_COHERENT_DMA)
>  add_subdirectory(providers/mlx5)
> diff --git a/pyverbs/dmabuf.pxd b/pyverbs/dmabuf.pxd
> new file mode 100644
> index 0000000..3ef5dfb
> --- /dev/null
> +++ b/pyverbs/dmabuf.pxd
> @@ -0,0 +1,15 @@
> +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
> +# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
> +
> +#cython: language_level=3
> +
> +cdef class DmaBuf:
> +    cdef int dri_fd
> +    cdef int handle
> +    cdef int fd
> +    cdef unsigned long size
> +    cdef unsigned long map_offset
> +    cdef void *dmabuf
> +    cdef object dmabuf_mrs
> +    cdef add_ref(self, obj)
> +    cpdef close(self)
> diff --git a/pyverbs/dmabuf.pyx b/pyverbs/dmabuf.pyx
> new file mode 100644
> index 0000000..23d8e2a
> --- /dev/null
> +++ b/pyverbs/dmabuf.pyx
> @@ -0,0 +1,72 @@
> +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
> +# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
> +
> +#cython: language_level=3
> +
> +import weakref
> +
> +from pyverbs.base cimport close_weakrefs
> +from pyverbs.base import PyverbsRDMAErrno
> +from pyverbs.mr cimport DmaBufMR
> +
> +cdef extern from "dmabuf_alloc.h":
> +    cdef struct dmabuf:
> +        pass
> +    dmabuf *dmabuf_alloc(int unit, unsigned long size)
> +    void dmabuf_free(dmabuf *dmabuf)
> +    int dmabuf_get_dri_fd(dmabuf *dmabuf)
> +    int dmabuf_get_fd(dmabuf *dmabuf)
> +    unsigned long dmabuf_get_offset(dmabuf *dmabuf)
> +
> +
> +cdef class DmaBuf:
> +    def __init__(self, size, unit=0):
> +        """
> +        Allocate DmaBuf object from a GPU device. This is done through the
> +        DRI device interface. Usually this requires the effective user id
> +        being a member of the 'render' group.
> +        :param size: The size (in number of bytes) of the buffer.
> +        :param unit: The unit number of the GPU to allocate the buffer from.
> +        :return: The newly created DmaBuf object on success.
> +        """
> +        self.dmabuf_mrs = weakref.WeakSet()
> +        self.dmabuf = dmabuf_alloc(size, unit)
> +        if self.dmabuf == NULL:
> +            raise PyverbsRDMAErrno(f'Failed to allocate dmabuf of size {size} on unit {unit}')
> +        self.dri_fd = dmabuf_get_dri_fd(<dmabuf *>self.dmabuf)

dri_fd seems unused by the tests
> +        self.fd = dmabuf_get_fd(<dmabuf *>self.dmabuf)
> +        self.map_offset = dmabuf_get_offset(<dmabuf *>self.dmabuf)
> +
> +    def __dealloc__(self):
> +        self.close()
> +
> +    cpdef close(self):
> +        if self.dmabuf == NULL:
> +            return None
> +        close_weakrefs([self.dmabuf_mrs])
> +        dmabuf_free(<dmabuf *>self.dmabuf)
> +        self.dmabuf = NULL
> +
> +    cdef add_ref(self, obj):
> +        if isinstance(obj, DmaBufMR):
> +            self.dmabuf_mrs.add(obj)
> +
> +    @property
> +    def dri_fd(self):
> +        return self.dri_fd
> +
> +    @property
> +    def handle(self):
> +        return self.handle
> +
> +    @property
> +    def fd(self):
> +        return self.fd
> +
> +    @property
> +    def size(self):
> +        return self.size
> +
> +    @property
> +    def map_offset(self):
> +        return self.map_offset
> diff --git a/pyverbs/dmabuf_alloc.c b/pyverbs/dmabuf_alloc.c
> new file mode 100644
> index 0000000..b958a3e
> --- /dev/null
> +++ b/pyverbs/dmabuf_alloc.c
> @@ -0,0 +1,296 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright 2020 Intel Corporation. All rights reserved. See COPYING file
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <drm/drm.h>
> +#include <drm/i915_drm.h>
> +#include <drm/amdgpu_drm.h>
> +#include <drm/radeon_drm.h>
> +#include <fcntl.h>
> +#include <sys/ioctl.h>
> +#include "dmabuf_alloc.h"
> +
> +/*
> + * Abstraction of the buffer allocation mechanism using the DRI interface.
> + * The interface is accessed by ioctl() calls over the '/dev/dri/renderD*'
> + * device. Successful access usually requires the effective user id being
> + * in the 'render' group.
> + */
> +
> +struct dri {
> +	int fd;
> +	int (*alloc)(struct dri *dri, uint64_t size, uint32_t *handle);
> +	int (*mmap_offset)(struct dri *dri, uint32_t handle, uint64_t *offset);
> +};
> +
> +static int i915_alloc(struct dri *dri, uint64_t size, uint32_t *handle)
> +{
> +	struct drm_i915_gem_create gem_create = {0};
> +	int err;
> +
> +	gem_create.size = size;
> +	err = ioctl(dri->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create);
> +	if (err)
> +		return err;
> +
> +	*handle = gem_create.handle;
> +	return 0;
> +}
> +
> +static int amdgpu_alloc(struct dri *dri, size_t size, uint32_t *handle)
> +{
> +	union drm_amdgpu_gem_create gem_create = {{0}};
> +	int err;
> +
> +	gem_create.in.bo_size = size;
> +	gem_create.in.domains = AMDGPU_GEM_DOMAIN_VRAM;

I think you minimally also need domain_flags =
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, otherwise you can end up being
unlucky and the mmap fails.

I think it would also be good to have a separate mode where the buffer is
forced to be in system memory (AMDGPU_GEM_DOMAIN_GTT and additionally
AMDGPU_GEM_CREATE_CPU_GTT_USWC needed). This should be useful for cases
where p2p doesn't work, but we still want to check the dma-buf
functionality.

> +	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &gem_create);
> +	if (err)
> +		return err;
> +
> +	*handle = gem_create.out.handle;
> +	return 0;
> +}
> +
> +static int radeon_alloc(struct dri *dri, size_t size, uint32_t *handle)

Tbh radeon chips are old enough I wouldn't care. Also doesn't support p2p
dma-buf, so always going to be in system memory when you share. Plus you
also need some more flags like I suggested above I think.

> +{
> +	struct drm_radeon_gem_create gem_create = {0};
> +	int err;
> +
> +	gem_create.size = size;
> +	gem_create.initial_domain = RADEON_GEM_DOMAIN_VRAM;
> +	err = ioctl(dri->fd, DRM_IOCTL_RADEON_GEM_CREATE, &gem_create);
> +	if (err)
> +		return err;
> +
> +	*handle = gem_create.handle;
> +	return 0;
> +}
> +
> +static int i915_mmap_offset(struct dri *dri, uint32_t handle, uint64_t *offset)
> +{
> +	struct drm_i915_gem_mmap_gtt gem_mmap = {0};
> +	int err;
> +
> +	gem_mmap.handle = handle;
> +	err = ioctl(dri->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &gem_mmap);
> +	if (err)
> +		return err;
> +
> +	*offset = gem_mmap.offset;
> +	return 0;
> +}
> +
> +static int amdgpu_mmap_offset(struct dri *dri, uint32_t handle,
> +			      uint64_t *offset)
> +{
> +	union drm_amdgpu_gem_mmap gem_mmap = {{0}};
> +	int err;
> +
> +	gem_mmap.in.handle = handle;
> +	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &gem_mmap);
> +	if (err)
> +		return err;
> +
> +	*offset = gem_mmap.out.addr_ptr;
> +	return 0;
> +}
> +
> +static int radeon_mmap_offset(struct dri *dri, uint32_t handle,
> +			      uint64_t *offset)
> +{
> +	struct drm_radeon_gem_mmap gem_mmap = {0};
> +	int err;
> +
> +	gem_mmap.handle = handle;
> +	err = ioctl(dri->fd, DRM_IOCTL_RADEON_GEM_MMAP, &gem_mmap);
> +	if (err)
> +		return err;
> +
> +	*offset = gem_mmap.addr_ptr;
> +	return 0;
> +}
> +
> +static struct dri *dri_open(int unit)
> +{
> +	char path[32];
> +	struct drm_version version = {0};
> +	char name[16] = {0};
> +	int err;
> +	struct dri *dri;
> +
> +	dri = malloc(sizeof(*dri));
> +	if (!dri)
> +		return NULL;
> +
> +	sprintf(path, "/dev/dri/renderD%d", unit + 128);
> +
> +	dri->fd = open(path, O_RDWR);
> +	if (dri->fd < 0)
> +		goto out_free;
> +
> +	version.name = name;
> +	version.name_len = 16;
> +	err = ioctl(dri->fd, DRM_IOCTL_VERSION, &version);
> +	if (err)
> +		goto out_close;
> +
> +	if (!strcmp(name, "amdgpu")) {
> +		dri->alloc = amdgpu_alloc;
> +		dri->mmap_offset = amdgpu_mmap_offset;
> +	} else if (!strcmp(name, "i915")) {
> +		dri->alloc = i915_alloc;
> +		dri->mmap_offset = i915_mmap_offset;
> +	} else if (!strcmp(name, "radeon")) {
> +		dri->alloc = radeon_alloc;
> +		dri->mmap_offset = radeon_mmap_offset;
> +	} else {
> +		goto out_close;
> +	}
> +	return dri;
> +
> +out_close:
> +	close(dri->fd);
> +
> +out_free:
> +	free(dri);
> +	return NULL;
> +}
> +
> +static void dri_close(struct dri *dri)
> +{
> +	if (!dri || dri->fd < 0)
> +		return;
> +
> +	close(dri->fd);
> +	free(dri);
> +}
> +
> +static void dri_free_buf(struct dri *dri, uint32_t handle)
> +{
> +	struct drm_gem_close close = {0};
> +
> +	close.handle = handle;
> +	ioctl(dri->fd, DRM_IOCTL_GEM_CLOSE, &close);
> +}
> +
> +static int dri_alloc_buf(struct dri *dri, size_t size, uint32_t *handle, int *fd)
> +{
> +	struct drm_prime_handle prime_handle = {0};
> +	int err;
> +
> +	if (!dri || dri->fd < 0)
> +		return -EINVAL;
> +
> +	err = dri->alloc(dri, size, handle);
> +	if (err)
> +		return err;
> +
> +	prime_handle.handle = *handle;
> +	prime_handle.flags = O_RDWR;
> +	err = ioctl(dri->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle);
> +	if (err) {
> +		dri_free_buf(dri, *handle);
> +		return err;
> +	}
> +
> +	*fd = prime_handle.fd;
> +	return 0;
> +}
> +
> +static int dri_map_buf(struct dri *dri, uint32_t handle, uint64_t *offset)
> +{
> +	if (!dri || dri->fd < 0)
> +		return -EINVAL;
> +
> +	return dri->mmap_offset(dri, handle, offset);
> +}
> +
> +/*
> + * Abstraction of dmabuf object, allocated using the DRI abstraction defined
> + * above.
> + */
> +
> +struct dmabuf {
> +	struct dri *dri;
> +	int fd;
> +	uint32_t handle;
> +	uint64_t map_offset;
> +};
> +
> +struct dmabuf *dmabuf_alloc(uint64_t size, int unit)
> +{
> +	struct dmabuf *dmabuf;
> +	int err;
> +
> +	dmabuf = malloc(sizeof *dmabuf);
> +	if (!dmabuf)
> +		return NULL;
> +
> +	dmabuf->dri = dri_open(unit);
> +	if (!dmabuf->dri)
> +		goto out_free;
> +
> +	err = dri_alloc_buf(dmabuf->dri, size, &dmabuf->handle, &dmabuf->fd);
> +	if (err)
> +		goto out_close;
> +
> +	err = dri_map_buf(dmabuf->dri, dmabuf->handle, &dmabuf->map_offset);
> +	if (err)
> +		goto out_free_buf;
> +
> +	return dmabuf;
> +
> +out_free_buf:
> +	dri_free_buf(dmabuf->dri, dmabuf->handle);
> +
> +out_close:
> +	dri_close(dmabuf->dri);
> +
> +out_free:
> +	free(dmabuf);
> +	return NULL;
> +}
> +
> +void dmabuf_free(struct dmabuf *dmabuf)
> +{
> +	if (!dmabuf)
> +		return;
> +
> +	close(dmabuf->fd);
> +	dri_free_buf(dmabuf->dri, dmabuf->handle);
> +	dri_close(dmabuf->dri);
> +	free(dmabuf);
> +}
> +
> +int dmabuf_get_dri_fd(struct dmabuf *dmabuf)
> +{
> +	if (!dmabuf || !dmabuf->dri)
> +		return -1;
> +
> +	return dmabuf->dri->fd;
> +}
> +
> +int dmabuf_get_fd(struct dmabuf *dmabuf)
> +{
> +	if (!dmabuf)
> +		return -1;
> +
> +	return dmabuf->fd;
> +}
> +
> +uint64_t dmabuf_get_offset(struct dmabuf *dmabuf)
> +{
> +	if (!dmabuf)
> +		return -1;
> +
> +	return dmabuf->map_offset;
> +}
> +
> diff --git a/pyverbs/dmabuf_alloc.h b/pyverbs/dmabuf_alloc.h
> new file mode 100644
> index 0000000..f36c337
> --- /dev/null
> +++ b/pyverbs/dmabuf_alloc.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> +/*
> + * Copyright 2020 Intel Corporation. All rights reserved. See COPYING file
> + */
> +
> +#ifndef _DRI_H_
> +#define _DRI_H_
> +
> +#include <stdint.h>
> +
> +struct dmabuf;
> +
> +struct dmabuf *dmabuf_alloc(uint64_t size, int unit);
> +void dmabuf_free(struct dmabuf *dmabuf);
> +int dmabuf_get_dri_fd(struct dmabuf *dmabuf);
> +int dmabuf_get_fd(struct dmabuf *dmabuf);
> +uint64_t dmabuf_get_offset(struct dmabuf *dmabuf);
> +
> +#endif /* _DRI_H_ */
> diff --git a/pyverbs/libibverbs.pxd b/pyverbs/libibverbs.pxd
> index 6fbba54..d76f633 100644
> --- a/pyverbs/libibverbs.pxd
> +++ b/pyverbs/libibverbs.pxd
> @@ -507,6 +507,8 @@ cdef extern from 'infiniband/verbs.h':
>      ibv_pd *ibv_alloc_pd(ibv_context *context)
>      int ibv_dealloc_pd(ibv_pd *pd)
>      ibv_mr *ibv_reg_mr(ibv_pd *pd, void *addr, size_t length, int access)
> +    ibv_mr *ibv_reg_dmabuf_mr(ibv_pd *pd, uint64_t offset, size_t length,
> +                              uint64_t iova, int fd, int access)
>      int ibv_dereg_mr(ibv_mr *mr)
>      int ibv_advise_mr(ibv_pd *pd, uint32_t advice, uint32_t flags,
>                        ibv_sge *sg_list, uint32_t num_sge)
> diff --git a/pyverbs/mr.pxd b/pyverbs/mr.pxd
> index ebe8ada..d9a79ff 100644
> --- a/pyverbs/mr.pxd
> +++ b/pyverbs/mr.pxd
> @@ -1,5 +1,6 @@
>  # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
>  # Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
> +# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
>  
>  #cython: language_level=3
>  
> @@ -33,3 +34,8 @@ cdef class MW(PyverbsCM):
>  
>  cdef class DMMR(MR):
>      cdef object dm
> +
> +cdef class DmaBufMR(MR):
> +    cdef object dmabuf
> +    cdef unsigned long offset
> +    cdef object is_dmabuf_internal
> diff --git a/pyverbs/mr.pyx b/pyverbs/mr.pyx
> index 7011da1..e4ed2dc 100644
> --- a/pyverbs/mr.pyx
> +++ b/pyverbs/mr.pyx
> @@ -1,11 +1,12 @@
>  # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
>  # Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
> +# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
>  
>  import resource
>  import logging
>  
>  from posix.mman cimport mmap, munmap, MAP_PRIVATE, PROT_READ, PROT_WRITE, \
> -    MAP_ANONYMOUS, MAP_HUGETLB
> +    MAP_ANONYMOUS, MAP_HUGETLB, MAP_SHARED
>  from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError, \
>      PyverbsUserError
>  from libc.stdint cimport uintptr_t, SIZE_MAX
> @@ -14,9 +15,10 @@ from posix.stdlib cimport posix_memalign
>  from libc.string cimport memcpy, memset
>  cimport pyverbs.libibverbs_enums as e
>  from pyverbs.device cimport DM
> -from libc.stdlib cimport free
> +from libc.stdlib cimport free, malloc
>  from .cmid cimport CMID
>  from .pd cimport PD
> +from .dmabuf cimport DmaBuf
>  
>  cdef extern from 'sys/mman.h':
>      cdef void* MAP_FAILED
> @@ -348,6 +350,103 @@ cdef class DMMR(MR):
>      cpdef read(self, length, offset):
>          return self.dm.copy_from_dm(offset, length)
>  
> +cdef class DmaBufMR(MR):
> +    def __init__(self, PD pd not None, length, access, DmaBuf dmabuf=None,
> +                 offset=0):
> +        """
> +        Initializes a DmaBufMR (DMA-BUF Memory Region) of the given length
> +        and access flags using the given PD and DmaBuf objects.
> +        :param pd: A PD object
> +        :param length: Length in bytes
> +        :param access: Access flags, see ibv_access_flags enum
> +        :param dmabuf: A DmaBuf object. One will be allocated if absent
> +        :param offset: Byte offset from the beginning of the dma-buf
> +        :return: The newly created DMABUFMR
> +        """
> +        self.logger = logging.getLogger(self.__class__.__name__)
> +        if dmabuf is None:
> +            self.is_dmabuf_internal = True
> +            dmabuf = DmaBuf(length + offset)
> +        self.mr = v.ibv_reg_dmabuf_mr(pd.pd, offset, length, offset, dmabuf.fd, access)
> +        if self.mr == NULL:
> +            raise PyverbsRDMAErrno(f'Failed to register a dma-buf MR. length: {length}, access flags: {access}')
> +        super().__init__(pd, length, access)
> +        self.pd = pd
> +        self.dmabuf = dmabuf
> +        self.offset = offset
> +        pd.add_ref(self)
> +        dmabuf.add_ref(self)
> +        self.logger.debug(f'Registered dma-buf ibv_mr. Length: {length}, access flags {access}')
> +
> +    def __dealloc__(self):
> +        self.close()
> +
> +    cpdef close(self):
> +        """
> +        Closes the underlying C object of the MR and frees the memory allocated.
> +        :return: None
> +        """
> +        if self.mr != NULL:
> +            self.logger.debug('Closing dma-buf MR')
> +            rc = v.ibv_dereg_mr(self.mr)
> +            if rc != 0:
> +                raise PyverbsRDMAError('Failed to dereg dma-buf MR', rc)
> +            self.pd = None
> +            self.mr = NULL
> +            # Set self.mr to NULL before closing dmabuf because this method is
> +            # re-entered when close_weakrefs() is called inside dmabuf.close().
> +            if self.is_dmabuf_internal:
> +                self.dmabuf.close()
> +            self.dmabuf = None
> +
> +    @property
> +    def offset(self):
> +        return self.offset
> +
> +    @property
> +    def dmabuf(self):
> +        return self.dmabuf
> +
> +    def write(self, data, length, offset=0):
> +        """
> +        Write user data to the dma-buf backing the MR
> +        :param data: User data to write
> +        :param length: Length of the data to write
> +        :param offset: Writing offset
> +        :return: None
> +        """
> +        if isinstance(data, str):
> +            data = data.encode()
> +        cdef int off = offset + self.offset
> +        cdef void *buf = mmap(NULL, length + off, PROT_READ | PROT_WRITE,
> +                              MAP_SHARED, self.dmabuf.dri_fd,
> +                              self.dmabuf.map_offset)
> +        if buf == MAP_FAILED:
> +            raise PyverbsError(f'Failed to map dma-buf of size {length}')
> +        memcpy(<char*>(buf + off), <char *>data, length)
> +        munmap(buf, length + off)
> +
> +    cpdef read(self, length, offset):

Note reads are generally uncached so really slow. Maybe put that as a
warning somewhere.

> +        """
> +        Reads data from the dma-buf backing the MR
> +        :param length: Length of data to read
> +        :param offset: Reading offset
> +        :return: The data on the buffer in the requested offset
> +        """
> +        cdef int off = offset + self.offset
> +        cdef void *buf = mmap(NULL, length + off, PROT_READ | PROT_WRITE,
> +                              MAP_SHARED, self.dmabuf.dri_fd,
> +                              self.dmabuf.map_offset)
> +        if buf == MAP_FAILED:
> +            raise PyverbsError(f'Failed to map dma-buf of size {length}')
> +        cdef char *data =<char*>malloc(length)
> +        memset(data, 0, length)
> +        memcpy(data, <char*>(buf + off), length)
> +        munmap(buf, length + off)
> +        res = data[:length]
> +        free(data)
> +        return res
> +
>  
>  def mwtype2str(mw_type):
>      mw_types = {1:'IBV_MW_TYPE_1', 2:'IBV_MW_TYPE_2'}

gpu side looks reasonable.

One bikeshed maybe: Kernel gpu drivers are drm (for direct rendering
manager). DRI is the X11 protocols to support glx direct rendering (i.e.
it's direct rendering infrastructure). devnodes being put into dri is an
unfortunate historical accident. I'd rename all the dri_ to drm_ for
consistency with other drm users, e.g. libdrm.

Cheers, Daniel
> -- 
> 1.8.3.1
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Jason Gunthorpe Nov. 30, 2020, 3:55 p.m. UTC | #2
On Mon, Nov 30, 2020 at 03:57:41PM +0100, Daniel Vetter wrote:
> > +	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &gem_create);
> > +	if (err)
> > +		return err;
> > +
> > +	*handle = gem_create.out.handle;
> > +	return 0;
> > +}
> > +
> > +static int radeon_alloc(struct dri *dri, size_t size, uint32_t *handle)
> 
> Tbh radeon chips are old enough I wouldn't care. Also doesn't support p2p
> dma-buf, so always going to be in system memory when you share. Plus you
> also need some more flags like I suggested above I think.

What about nouveau?

Jason
Daniel Vetter Nov. 30, 2020, 4:04 p.m. UTC | #3
On Mon, Nov 30, 2020 at 11:55:44AM -0400, Jason Gunthorpe wrote:
> On Mon, Nov 30, 2020 at 03:57:41PM +0100, Daniel Vetter wrote:
> > > +	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &gem_create);
> > > +	if (err)
> > > +		return err;
> > > +
> > > +	*handle = gem_create.out.handle;
> > > +	return 0;
> > > +}
> > > +
> > > +static int radeon_alloc(struct dri *dri, size_t size, uint32_t *handle)
> > 
> > Tbh radeon chips are old enough I wouldn't care. Also doesn't support p2p
> > dma-buf, so always going to be in system memory when you share. Plus you
> > also need some more flags like I suggested above I think.
> 
> What about nouveau?

Realistically, chances that someone wants to use rdma together with the
upstream nouveau driver are roughly nil. Imo also needs someone with the
right hardware to make sure it works (since the flags are all kinda arcane
driver specific stuff testing is really needed).
-Daniel
Jason Gunthorpe Nov. 30, 2020, 4:08 p.m. UTC | #4
On Fri, Nov 27, 2020 at 12:55:41PM -0800, Jianxin Xiong wrote:
>  
> +function(rdma_multifile_module PY_MODULE MODULE_NAME LINKER_FLAGS)

I think just replace rdma_cython_module with this? No good reason I
can see to have two APIs?

> +  set(ALL_CFILES "")
> +  foreach(SRC_FILE ${ARGN})
> +    get_filename_component(FILENAME ${SRC_FILE} NAME_WE)
> +    get_filename_component(DIR ${SRC_FILE} DIRECTORY)
> +    get_filename_component(EXT ${SRC_FILE} EXT)
> +    if (DIR)
> +      set(SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${DIR}")
> +    else()
> +      set(SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
> +    endif()
> +    if (${EXT} STREQUAL ".pyx")
> +      set(PYX "${SRC_PATH}/${FILENAME}.pyx")
> +      set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c")
> +      include_directories(${PYTHON_INCLUDE_DIRS})
> +      add_custom_command(
> +        OUTPUT "${CFILE}"
> +        MAIN_DEPENDENCY "${PYX}"
> +        COMMAND ${CYTHON_EXECUTABLE} "${PYX}" -o "${CFILE}"
> +        "-I${PYTHON_INCLUDE_DIRS}"
> +        COMMENT "Cythonizing ${PYX}"
> +      )
> +      set(ALL_CFILES "${ALL_CFILES};${CFILE}")
> +    elseif(${EXT} STREQUAL ".c")
> +      set(CFILE_ORIG "${SRC_PATH}/${FILENAME}.c")
> +      set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c")
> +      if (NOT ${CFILE_ORIG} STREQUAL ${CFILE})
> +        rdma_create_symlink("${CFILE_ORIG}" "${CFILE}")
> +      endif()

Why does this need the create_symlink? The compiler should work OK
from the source file?

> +      set(ALL_CFILES "${ALL_CFILES};${CFILE}")
> +    elseif(${EXT} STREQUAL ".h")
> +      set(HFILE_ORIG "${SRC_PATH}/${FILENAME}.h")
> +      set(HFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.h")
> +      if (NOT ${HFILE_ORIG} STREQUAL ${HFILE})
> +        rdma_create_symlink("${HFILE_ORIG}" "${HFILE}")

Here too? You probably don't need to specify h files at all, at worst
they should only be used with publish_internal_headers

> +      endif()
> +    else()
> +      continue()
> +    endif()
> +  endforeach()
> +  string(REGEX REPLACE "\\.so$" "" SONAME "${MODULE_NAME}${CMAKE_PYTHON_SO_SUFFIX}")
> +  add_library(${SONAME} SHARED ${ALL_CFILES})
> +  set_target_properties(${SONAME} PROPERTIES
> +    COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -fno-strict-aliasing -Wno-unused-function -Wno-redundant-decls -Wno-shadow -Wno-cast-function-type -Wno-implicit-fallthrough -Wno-unknown-warning -Wno-unknown-warning-option -Wno-deprecated-declarations ${NO_VAR_TRACKING_FLAGS}"

Ugh, you copy and pasted this, but it shouldn't have existed in the
first place. Compiler arguments like this should not be specified
manually. I should fix it..

Also you should cc edward on all this pyverbs stuff, he knows it all
very well

It all looks reasonable to me

Jason
Jason Gunthorpe Nov. 30, 2020, 4:36 p.m. UTC | #5
On Mon, Nov 30, 2020 at 05:04:43PM +0100, Daniel Vetter wrote:
> On Mon, Nov 30, 2020 at 11:55:44AM -0400, Jason Gunthorpe wrote:
> > On Mon, Nov 30, 2020 at 03:57:41PM +0100, Daniel Vetter wrote:
> > > > +	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &gem_create);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > +	*handle = gem_create.out.handle;
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int radeon_alloc(struct dri *dri, size_t size, uint32_t *handle)
> > > 
> > > Tbh radeon chips are old enough I wouldn't care. Also doesn't support p2p
> > > dma-buf, so always going to be in system memory when you share. Plus you
> > > also need some more flags like I suggested above I think.
> > 
> > What about nouveau?
> 
> Reallistically chances that someone wants to use rdma together with the
> upstream nouveau driver are roughly nil. Imo also needs someone with the
> right hardware to make sure it works (since the flags are all kinda arcane
> driver specific stuff testing is really needed).

Well, it would be helpful if we can test the mlx5 part of the
implementation, and I have a lab stocked with nouveau compatible HW..

But you are right someone needs to test/etc, so this does not seem
like Jianxin should worry

Jason
Daniel Vetter Nov. 30, 2020, 4:55 p.m. UTC | #6
On Mon, Nov 30, 2020 at 12:36:42PM -0400, Jason Gunthorpe wrote:
> On Mon, Nov 30, 2020 at 05:04:43PM +0100, Daniel Vetter wrote:
> > On Mon, Nov 30, 2020 at 11:55:44AM -0400, Jason Gunthorpe wrote:
> > > On Mon, Nov 30, 2020 at 03:57:41PM +0100, Daniel Vetter wrote:
> > > > > +	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &gem_create);
> > > > > +	if (err)
> > > > > +		return err;
> > > > > +
> > > > > +	*handle = gem_create.out.handle;
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static int radeon_alloc(struct dri *dri, size_t size, uint32_t *handle)
> > > > 
> > > > Tbh radeon chips are old enough I wouldn't care. Also doesn't support p2p
> > > > dma-buf, so always going to be in system memory when you share. Plus you
> > > > also need some more flags like I suggested above I think.
> > > 
> > > What about nouveau?
> > 
> > Reallistically chances that someone wants to use rdma together with the
> > upstream nouveau driver are roughly nil. Imo also needs someone with the
> > right hardware to make sure it works (since the flags are all kinda arcane
> > driver specific stuff testing is really needed).
> 
> Well, it would be helpful if we can test the mlx5 part of the
> implementation, and I have a lab stocked with nouveau compatible HW..
> 
> But you are right someone needs to test/etc, so this does not seem
> like Jianxin should worry

Ah yes sounds good. I can help with trying to find how to allocate vram
with nouveau if you don't find it. Caveat is that nouveau doesn't do
dynamic dma-buf exports and hence none of the interesting flows and also
not p2p. Not sure how much work it would be to roll that out (iirc it
wasn't that much amdgpu code really, just endless discussions on the
interface semantics and how to roll it out without breaking any of the
existing dma-buf users).

Another thing that just crossed my mind: Do we have a testcase for forcing
the eviction? Should be fairly easy to provoke with something like this:

- register vram-only buffer with mlx5 and do something that binds it
- allocate enough vram-only buffers to overfill vram (again figuring out
  how much vram you have is driver specific)
- touch each buffer with mmap. that should force the mlx5 buffer out. it
  might be that eviction isn't lru but preferentially picks idle buffers
  (i.e. not used by hw, so anything registered with mlx5 won't qualify as
  first victims). so we might need to instead register a ton of buffers
  with mlx5 and access them through ibverbs
- do something with mlx5 again to force the rebinding and test it all
  keeps working

That entire invalidate/buffer move flow is the most complex interaction I
think.
-Daniel
Xiong, Jianxin Nov. 30, 2020, 5:53 p.m. UTC | #7
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Monday, November 30, 2020 8:08 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [PATCH rdma-core v3 4/6] pyverbs: Add dma-buf based MR support
> 
> On Fri, Nov 27, 2020 at 12:55:41PM -0800, Jianxin Xiong wrote:
> >
> > +function(rdma_multifile_module PY_MODULE MODULE_NAME LINKER_FLAGS)
> 
> I think just replace rdma_cython_module with this? No good reason I can see to have two APIs?

rdma_cython_module can handle many modules, but this one is for a single module.
If you agree, I can merge the two by slightly tweaking the logic: each module starts 
with a .pyx file, followed by 0 or more .c and .h files.

> 
> > +  set(ALL_CFILES "")
> > +  foreach(SRC_FILE ${ARGN})
> > +    get_filename_component(FILENAME ${SRC_FILE} NAME_WE)
> > +    get_filename_component(DIR ${SRC_FILE} DIRECTORY)
> > +    get_filename_component(EXT ${SRC_FILE} EXT)
> > +    if (DIR)
> > +      set(SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${DIR}")
> > +    else()
> > +      set(SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
> > +    endif()
> > +    if (${EXT} STREQUAL ".pyx")
> > +      set(PYX "${SRC_PATH}/${FILENAME}.pyx")
> > +      set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c")
> > +      include_directories(${PYTHON_INCLUDE_DIRS})
> > +      add_custom_command(
> > +        OUTPUT "${CFILE}"
> > +        MAIN_DEPENDENCY "${PYX}"
> > +        COMMAND ${CYTHON_EXECUTABLE} "${PYX}" -o "${CFILE}"
> > +        "-I${PYTHON_INCLUDE_DIRS}"
> > +        COMMENT "Cythonizing ${PYX}"
> > +      )
> > +      set(ALL_CFILES "${ALL_CFILES};${CFILE}")
> > +    elseif(${EXT} STREQUAL ".c")
> > +      set(CFILE_ORIG "${SRC_PATH}/${FILENAME}.c")
> > +      set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c")
> > +      if (NOT ${CFILE_ORIG} STREQUAL ${CFILE})
> > +        rdma_create_symlink("${CFILE_ORIG}" "${CFILE}")
> > +      endif()
> 
> Why does this need the create_symlink? The compiler should work OK from the source file?

You are right, the link for .c is not necessary, but the link for .h is needed.

> 
> > +      set(ALL_CFILES "${ALL_CFILES};${CFILE}")
> > +    elseif(${EXT} STREQUAL ".h")
> > +      set(HFILE_ORIG "${SRC_PATH}/${FILENAME}.h")
> > +      set(HFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.h")
> > +      if (NOT ${HFILE_ORIG} STREQUAL ${HFILE})
> > +        rdma_create_symlink("${HFILE_ORIG}" "${HFILE}")
> 
> Here too? You probably don't need to specify h files at all, at worst they should only be used with publish_internal_headers

Without the .h link, the compiler fails to find the header file (both dmabuf_alloc.c and the generated "dmabuf.c" contain #include "dmabuf_alloc.h").

> 
> > +      endif()
> > +    else()
> > +      continue()
> > +    endif()
> > +  endforeach()
> > +  string(REGEX REPLACE "\\.so$" "" SONAME
> > + "${MODULE_NAME}${CMAKE_PYTHON_SO_SUFFIX}")
> > +  add_library(${SONAME} SHARED ${ALL_CFILES})
> > + set_target_properties(${SONAME} PROPERTIES
> > +    COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -fno-strict-aliasing -Wno-unused-function -Wno-redundant-decls -Wno-shadow -Wno-
> cast-function-type -Wno-implicit-fallthrough -Wno-unknown-warning -Wno-unknown-warning-option -Wno-deprecated-declarations
> ${NO_VAR_TRACKING_FLAGS}"
> 
> Ugh, you copy and pasted this, but it shouldn't have existed in the first place. Compiler arguments like this should not be specified manually.
> I should fix it..
> 
> Also you should cc edward on all this pyverbs stuff, he knows it all very well

Will add Edward next time. He commented a lot on the PR at github. The current github PR
is in sync with this version.

> 
> It all looks reasonable to me
> 
> Jason
Xiong, Jianxin Nov. 30, 2020, 6:03 p.m. UTC | #8
> -----Original Message-----
> From: Daniel Vetter <daniel@ffwll.ch>
> Sent: Monday, November 30, 2020 8:56 AM
> To: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Daniel Vetter <daniel@ffwll.ch>; Xiong, Jianxin <jianxin.xiong@intel.com>; linux-rdma@vger.kernel.org; dri-
> devel@lists.freedesktop.org; Leon Romanovsky <leon@kernel.org>; Doug Ledford <dledford@redhat.com>; Vetter, Daniel
> <daniel.vetter@intel.com>; Christian Koenig <christian.koenig@amd.com>
> Subject: Re: [PATCH rdma-core v3 4/6] pyverbs: Add dma-buf based MR support
> 
> On Mon, Nov 30, 2020 at 12:36:42PM -0400, Jason Gunthorpe wrote:
> > On Mon, Nov 30, 2020 at 05:04:43PM +0100, Daniel Vetter wrote:
> > > On Mon, Nov 30, 2020 at 11:55:44AM -0400, Jason Gunthorpe wrote:
> > > > On Mon, Nov 30, 2020 at 03:57:41PM +0100, Daniel Vetter wrote:
> > > > > > +	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &gem_create);
> > > > > > +	if (err)
> > > > > > +		return err;
> > > > > > +
> > > > > > +	*handle = gem_create.out.handle;
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int radeon_alloc(struct dri *dri, size_t size,
> > > > > > +uint32_t *handle)
> > > > >
> > > > > Tbh radeon chips are old enough I wouldn't care. Also doesn't
> > > > > support p2p dma-buf, so always going to be in system memory when
> > > > > you share. Plus you also need some more flags like I suggested above I think.
> > > >
> > > > What about nouveau?
> > >
> > > Reallistically chances that someone wants to use rdma together with
> > > the upstream nouveau driver are roughly nil. Imo also needs someone
> > > with the right hardware to make sure it works (since the flags are
> > > all kinda arcane driver specific stuff testing is really needed).
> >
> > Well, it would be helpful if we can test the mlx5 part of the
> > implementation, and I have a lab stocked with nouveau compatible HW..
> >
> > But you are right someone needs to test/etc, so this does not seem
> > like Jianxin should worry
> 
> Ah yes sounds good. I can help with trying to find how to allocate vram with nouveau if you don't find it. Caveat is that nouveau doesn't do
> dynamic dma-buf exports and hence none of the intersting flows and also not p2p. Not sure how much work it would be to roll that out (iirc
> it wasnt that much amdgpu code really, just endless discussions on the interface semantics and how to roll it out without breaking any of
> the existing dma-buf users).
> 
> Another thing that just crossed my mind: Do we have a testcase for forcing the eviction? Should be fairly easy to provoke with something
> like this:
> 
> - register vram-only buffer with mlx5 and do something that binds it
> - allocate enough vram-only buffers to overfill vram (again figuring out
>   how much vram you have is driver specific)
> - touch each buffer with mmap. that should force the mlx5 buffer out. it
>   might be that eviction isn't lru but preferentially idle buffers (i.e.
>   not used by hw, so anything register to mlx5 won't qualify as first
>   victims). so we might need to instead register a ton of buffers with
>   mlx5 and access them through ibverbs
> - do something with mlx5 again to force the rebinding and test it all
>   keeps working
> 
> That entire invalidate/buffer move flow is the most complex interaction I think.

Right now on my side the evict scenario is tested with the "timeout" feature of the
AMD gpu. The GPU driver would move all VRAM allocations to system buffer after
a certain period of "inactivity" (10s by default). VRAM being accessed by peer DMA
is not counted as activity from GPU's POV. I can observe the invalidation/remapping
sequence by running an RDMA test for long enough time. 

I agree having a more generic mechanism to force this scenario is going to be useful.

> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Xiong, Jianxin Nov. 30, 2020, 6:13 p.m. UTC | #9
> -----Original Message-----
> From: Daniel Vetter <daniel@ffwll.ch>
> Sent: Monday, November 30, 2020 6:58 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Leon Romanovsky <leon@kernel.org>; Jason Gunthorpe <jgg@ziepe.ca>;
> Doug Ledford <dledford@redhat.com>; Vetter, Daniel <daniel.vetter@intel.com>; Christian Koenig <christian.koenig@amd.com>
> Subject: Re: [PATCH rdma-core v3 4/6] pyverbs: Add dma-buf based MR support
> 
> > +cdef class DmaBuf:
> > +    def __init__(self, size, unit=0):
> > +        """
> > +        Allocate DmaBuf object from a GPU device. This is done through the
> > +        DRI device interface. Usually this requires the effective user id
> > +        being a member of the 'render' group.
> > +        :param size: The size (in number of bytes) of the buffer.
> > +        :param unit: The unit number of the GPU to allocate the buffer from.
> > +        :return: The newly created DmaBuf object on success.
> > +        """
> > +        self.dmabuf_mrs = weakref.WeakSet()
> > +        self.dmabuf = dmabuf_alloc(size, unit)
> > +        if self.dmabuf == NULL:
> > +            raise PyverbsRDMAErrno(f'Failed to allocate dmabuf of size {size} on unit {unit}')
> > +        self.dri_fd = dmabuf_get_dri_fd(<dmabuf *>self.dmabuf)
> 
> dri_fd seems unused by the tests

It's used by the read/write methods of the DmaBufMR class for performing mmap.
Jason Gunthorpe Dec. 2, 2020, 12:39 a.m. UTC | #10
On Mon, Nov 30, 2020 at 05:53:39PM +0000, Xiong, Jianxin wrote:
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Monday, November 30, 2020 8:08 AM
> > To: Xiong, Jianxin <jianxin.xiong@intel.com>
> > Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> > <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> > <daniel.vetter@intel.com>
> > Subject: Re: [PATCH rdma-core v3 4/6] pyverbs: Add dma-buf based MR support
> > 
> > On Fri, Nov 27, 2020 at 12:55:41PM -0800, Jianxin Xiong wrote:
> > >
> > > +function(rdma_multifile_module PY_MODULE MODULE_NAME LINKER_FLAGS)
> > 
> > I think just replace rdma_cython_module with this? No good reason I can see to have two APIs?
> 
> rdma_cython_module can handle many modules, but this one is for a single module.
> If you agree, I can merge the two by slightly tweaking the logic: each module starts 
> with a .pyx file, followed by 0 or more .c and .h files.

Then have rdma_cython_module call some rdma_single_cython_module()
multiple times that has this code below?

> > Here too? You probably don't need to specify h files at all, at
> > worst they should only be used with publish_internal_headers
> 
> Without the .h link, the compiler fail to find the header file (both
> dmabuf_alloc.c and the generated "dmabuf.c" contain #include
> "dmabuf_alloc.h").

Header files are made 'cross module' using the
"publish_internal_headers" command

But we could also hack in a -I directive to fix up the "" include for
the cython output..

But it should not be handled here in the cython module command

Jason
Xiong, Jianxin Dec. 2, 2020, 1:12 a.m. UTC | #11
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Tuesday, December 01, 2020 4:39 PM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [PATCH rdma-core v3 4/6] pyverbs: Add dma-buf based MR support
> 
> On Mon, Nov 30, 2020 at 05:53:39PM +0000, Xiong, Jianxin wrote:
> > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > Sent: Monday, November 30, 2020 8:08 AM
> > > To: Xiong, Jianxin <jianxin.xiong@intel.com>
> > > Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org;
> > > Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> > > <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian
> > > Koenig <christian.koenig@amd.com>; Vetter, Daniel
> > > <daniel.vetter@intel.com>
> > > Subject: Re: [PATCH rdma-core v3 4/6] pyverbs: Add dma-buf based MR
> > > support
> > >
> > > On Fri, Nov 27, 2020 at 12:55:41PM -0800, Jianxin Xiong wrote:
> > > >
> > > > +function(rdma_multifile_module PY_MODULE MODULE_NAME
> > > > +LINKER_FLAGS)
> > >
> > > I think just replace rdma_cython_module with this? No good reason I can see to have two APIs?
> >
> > rdma_cython_module can handle many modules, but this one is for a single module.
> > If you agree, I can merge the two by slightly tweaking the logic: each
> > module starts with a .pyx file, followed by 0 or more .c and .h files.
> 
> Then have rdma_cython_module call some rdam_single_cython_module() multiple times that has this code below?

Mostly like that. Here is an outline:

function(build_one_module PY_MODULE MODULE_NAME ALL_CFILES)
    string(REGEX REPLACE "\\.so$" "" SONAME "${MODULE_NAME}${CMAKE_PYTHON_SO_SUFFIX}")
    add_library(......)
    set_target_properties(......)
    target_link_libraries(......)
    install(......)
endfunction()

function(rdma_cython_module .......)
    foreach(SRC_FILE ${ARGN})
        ...... # commands to parse file name
        if (${EXT} STREQUAL ".pyx")
            if (ALL_CFILES AND MODULE_NAME)
                build_one_module(${PY_MODULE} ${MODULE_NAME} ${ALL_CFILES})
                set(ALL_CFILES "")
                set(MODULE_NAME "")
            endif()
            ...... # commands to convert .pyx to .c
            set(ALL_CFILES "${ALL_CFILES};${CFILE}")
        elseif (${EXT} STREQUAL ".c")
            ......
            set(ALL_CFILES "${ALL_CFILES};${CFILE}")
        else()
            continue()
        endif()
    endforeach()
    if (ALL_CFILES AND MODULE_NAME)
        build_one_module(${PY_MODULE} ${MODULE_NAME} ${ALL_CFILES})
    endif()
endfunction()

> 
> > > Here too? You probably don't need to specify h files at all, at
> > > worst they should only be used with publish_internal_headers
> >
> > Without the .h link, the compiler fail to find the header file (both
> > dmabuf_alloc.c and the generated "dmabuf.c" contain #include
> > "dmabuf_alloc.h").
> 
> Header files are made 'cross module' using the "publish_internal_headers" command
> 
> But we could also hack in a -I directive to fix up the "" include for the cython outupt..
> 
> But it should not be handled here in the cython module command

Sure. That can be fixed.

> 
> Jason
diff mbox series

Patch

diff --git a/buildlib/pyverbs_functions.cmake b/buildlib/pyverbs_functions.cmake
index 953cec2..2f6788e 100644
--- a/buildlib/pyverbs_functions.cmake
+++ b/buildlib/pyverbs_functions.cmake
@@ -1,5 +1,6 @@ 
 # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
 # Copyright (c) 2018, Mellanox Technologies. All rights reserved.  See COPYING file
+# Copyright (c) 2020, Intel Corporation. All rights reserved.  See COPYING file
 
 function(rdma_cython_module PY_MODULE LINKER_FLAGS)
   foreach(PYX_FILE ${ARGN})
@@ -32,6 +33,57 @@  function(rdma_cython_module PY_MODULE LINKER_FLAGS)
   endforeach()
 endfunction()
 
+# Build a single Python extension module from a mix of Cython and C sources.
+#   PY_MODULE    - subdirectory used for the build output and install location
+#   MODULE_NAME  - base name of the resulting shared object
+#   LINKER_FLAGS - extra items appended to the link line
+#   ARGN         - source files: .pyx files are cythonized, .c and .h files are
+#                  symlinked into the binary directory, anything else is ignored
+function(rdma_multifile_module PY_MODULE MODULE_NAME LINKER_FLAGS)
+  set(ALL_CFILES "")
+  foreach(SRC_FILE ${ARGN})
+    get_filename_component(FILENAME ${SRC_FILE} NAME_WE)
+    get_filename_component(DIR ${SRC_FILE} DIRECTORY)
+    get_filename_component(EXT ${SRC_FILE} EXT)
+    if (DIR)
+      set(SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${DIR}")
+    else()
+      set(SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
+    endif()
+    # EXT is empty for a file name without an extension, so every expansion
+    # used in a comparison is quoted to keep the if() well-formed.
+    if ("${EXT}" STREQUAL ".pyx")
+      set(PYX "${SRC_PATH}/${FILENAME}.pyx")
+      set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c")
+      include_directories(${PYTHON_INCLUDE_DIRS})
+      add_custom_command(
+        OUTPUT "${CFILE}"
+        MAIN_DEPENDENCY "${PYX}"
+        COMMAND ${CYTHON_EXECUTABLE} "${PYX}" -o "${CFILE}"
+        "-I${PYTHON_INCLUDE_DIRS}"
+        COMMENT "Cythonizing ${PYX}"
+      )
+      list(APPEND ALL_CFILES "${CFILE}")
+    elseif("${EXT}" STREQUAL ".c")
+      set(CFILE_ORIG "${SRC_PATH}/${FILENAME}.c")
+      set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c")
+      # No link is needed for an in-source build, where both paths coincide.
+      if (NOT "${CFILE_ORIG}" STREQUAL "${CFILE}")
+        rdma_create_symlink("${CFILE_ORIG}" "${CFILE}")
+      endif()
+      list(APPEND ALL_CFILES "${CFILE}")
+    elseif("${EXT}" STREQUAL ".h")
+      # Headers are only linked next to the generated C files; they are not
+      # added to the list of sources to compile.
+      set(HFILE_ORIG "${SRC_PATH}/${FILENAME}.h")
+      set(HFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.h")
+      if (NOT "${HFILE_ORIG}" STREQUAL "${HFILE}")
+        rdma_create_symlink("${HFILE_ORIG}" "${HFILE}")
+      endif()
+    else()
+      continue()
+    endif()
+  endforeach()
+  # The target is named after the module plus the Python suffix, minus ".so".
+  string(REGEX REPLACE "\\.so$" "" SONAME "${MODULE_NAME}${CMAKE_PYTHON_SO_SUFFIX}")
+  add_library(${SONAME} SHARED ${ALL_CFILES})
+  set_target_properties(${SONAME} PROPERTIES
+    COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -fno-strict-aliasing -Wno-unused-function -Wno-redundant-decls -Wno-shadow -Wno-cast-function-type -Wno-implicit-fallthrough -Wno-unknown-warning -Wno-unknown-warning-option -Wno-deprecated-declarations ${NO_VAR_TRACKING_FLAGS}"
+    LIBRARY_OUTPUT_DIRECTORY "${BUILD_PYTHON}/${PY_MODULE}"
+    PREFIX "")
+  target_link_libraries(${SONAME} LINK_PRIVATE ${PYTHON_LIBRARIES} ibverbs rdmacm ${LINKER_FLAGS})
+  install(TARGETS ${SONAME}
+    DESTINATION ${CMAKE_INSTALL_PYTHON_ARCH_LIB}/${PY_MODULE})
+endfunction()
+
 function(rdma_python_module PY_MODULE)
   foreach(PY_FILE ${ARGN})
     get_filename_component(LINK "${CMAKE_CURRENT_SOURCE_DIR}/${PY_FILE}" ABSOLUTE)
diff --git a/pyverbs/CMakeLists.txt b/pyverbs/CMakeLists.txt
index 9542c4b..1b21e7b 100644
--- a/pyverbs/CMakeLists.txt
+++ b/pyverbs/CMakeLists.txt
@@ -1,5 +1,6 @@ 
 # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
 # Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
 
 rdma_cython_module(pyverbs ""
   addr.pyx
@@ -24,6 +25,12 @@  rdma_python_module(pyverbs
   utils.py
   )
 
+rdma_multifile_module(pyverbs dmabuf ""
+  dmabuf.pyx
+  dmabuf_alloc.c
+  dmabuf_alloc.h
+  )
+
 # mlx5 and efa providers are not built without coherent DMA, e.g. ARM32 build.
 if (HAVE_COHERENT_DMA)
 add_subdirectory(providers/mlx5)
diff --git a/pyverbs/dmabuf.pxd b/pyverbs/dmabuf.pxd
new file mode 100644
index 0000000..3ef5dfb
--- /dev/null
+++ b/pyverbs/dmabuf.pxd
@@ -0,0 +1,15 @@ 
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
+
+#cython: language_level=3
+
+cdef class DmaBuf:
+    cdef int dri_fd
+    cdef int handle
+    cdef int fd
+    cdef unsigned long size
+    cdef unsigned long map_offset
+    cdef void *dmabuf
+    cdef object dmabuf_mrs
+    cdef add_ref(self, obj)
+    cpdef close(self)
diff --git a/pyverbs/dmabuf.pyx b/pyverbs/dmabuf.pyx
new file mode 100644
index 0000000..23d8e2a
--- /dev/null
+++ b/pyverbs/dmabuf.pyx
@@ -0,0 +1,72 @@ 
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
+
+#cython: language_level=3
+
+import weakref
+
+from pyverbs.base cimport close_weakrefs
+from pyverbs.base import PyverbsRDMAErrno
+from pyverbs.mr cimport DmaBufMR
+
+cdef extern from "dmabuf_alloc.h":
+    cdef struct dmabuf:
+        pass
+    # Parameter order must mirror the C prototype in dmabuf_alloc.h, which is
+    # dmabuf_alloc(uint64_t size, int unit); Cython converts each argument
+    # according to the types declared here.
+    dmabuf *dmabuf_alloc(unsigned long size, int unit)
+    void dmabuf_free(dmabuf *dmabuf)
+    int dmabuf_get_dri_fd(dmabuf *dmabuf)
+    int dmabuf_get_fd(dmabuf *dmabuf)
+    unsigned long dmabuf_get_offset(dmabuf *dmabuf)
+
+
+cdef class DmaBuf:
+    def __init__(self, size, unit=0):
+        """
+        Allocate DmaBuf object from a GPU device. This is done through the
+        DRI device interface. Usually this requires the effective user id
+        being a member of the 'render' group.
+        :param size: The size (in number of bytes) of the buffer.
+        :param unit: The unit number of the GPU to allocate the buffer from.
+        :return: The newly created DmaBuf object on success.
+        """
+        self.dmabuf_mrs = weakref.WeakSet()
+        self.dmabuf = dmabuf_alloc(size, unit)
+        if self.dmabuf == NULL:
+            raise PyverbsRDMAErrno(f'Failed to allocate dmabuf of size {size} on unit {unit}')
+        self.dri_fd = dmabuf_get_dri_fd(<dmabuf *>self.dmabuf)
+        self.fd = dmabuf_get_fd(<dmabuf *>self.dmabuf)
+        # Remember the requested size so the 'size' property reports it.
+        self.size = size
+        self.map_offset = dmabuf_get_offset(<dmabuf *>self.dmabuf)
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        """
+        Close the MRs tracked by this object, then free the underlying C
+        dmabuf. Calling it again after the buffer was freed is a no-op.
+        :return: None
+        """
+        if self.dmabuf == NULL:
+            return None
+        close_weakrefs([self.dmabuf_mrs])
+        dmabuf_free(<dmabuf *>self.dmabuf)
+        self.dmabuf = NULL
+
+    cdef add_ref(self, obj):
+        # Only DmaBufMR objects are tracked; anything else is ignored.
+        if isinstance(obj, DmaBufMR):
+            self.dmabuf_mrs.add(obj)
+
+    @property
+    def dri_fd(self):
+        return self.dri_fd
+
+    @property
+    def handle(self):
+        # NOTE(review): nothing in this class assigns self.handle and the C
+        # header exposes no getter for it - confirm whether one is missing.
+        return self.handle
+
+    @property
+    def fd(self):
+        return self.fd
+
+    @property
+    def size(self):
+        return self.size
+
+    @property
+    def map_offset(self):
+        return self.map_offset
diff --git a/pyverbs/dmabuf_alloc.c b/pyverbs/dmabuf_alloc.c
new file mode 100644
index 0000000..b958a3e
--- /dev/null
+++ b/pyverbs/dmabuf_alloc.c
@@ -0,0 +1,296 @@ 
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright 2020 Intel Corporation. All rights reserved. See COPYING file
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <drm/drm.h>
+#include <drm/i915_drm.h>
+#include <drm/amdgpu_drm.h>
+#include <drm/radeon_drm.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include "dmabuf_alloc.h"
+
+/*
+ * Abstraction of the buffer allocation mechanism using the DRI interface.
+ * The interface is accessed by ioctl() calls over the '/dev/dri/renderD*'
+ * device. Successful access usually requires the effective user id being
+ * in the 'render' group.
+ */
+
+/* One open DRI render device plus the driver-specific helpers chosen for it. */
+struct dri {
+	/* File descriptor returned by open() on the render node */
+	int fd;
+	/* Create a buffer of `size` bytes and store its handle in *handle */
+	int (*alloc)(struct dri *dri, uint64_t size, uint32_t *handle);
+	/* Look up the mmap offset of the buffer `handle` and store it in *offset */
+	int (*mmap_offset)(struct dri *dri, uint32_t handle, uint64_t *offset);
+};
+
+/*
+ * Create a buffer of `size` bytes with DRM_IOCTL_I915_GEM_CREATE.
+ * On success stores the new handle in *handle and returns 0; otherwise
+ * returns the non-zero ioctl() result and leaves *handle untouched.
+ */
+static int i915_alloc(struct dri *dri, uint64_t size, uint32_t *handle)
+{
+	struct drm_i915_gem_create gem_create = {0};
+	int err;
+
+	gem_create.size = size;
+	err = ioctl(dri->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create);
+	if (err)
+		return err;
+
+	*handle = gem_create.handle;
+	return 0;
+}
+
+/*
+ * Create a buffer of `size` bytes in the VRAM domain with
+ * DRM_IOCTL_AMDGPU_GEM_CREATE. On success stores the new handle in *handle
+ * and returns 0; otherwise returns the non-zero ioctl() result.
+ * The signature matches the `alloc` member of struct dri, which takes the
+ * size as uint64_t.
+ */
+static int amdgpu_alloc(struct dri *dri, uint64_t size, uint32_t *handle)
+{
+	union drm_amdgpu_gem_create gem_create = {{0}};
+	int err;
+
+	gem_create.in.bo_size = size;
+	gem_create.in.domains = AMDGPU_GEM_DOMAIN_VRAM;
+	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &gem_create);
+	if (err)
+		return err;
+
+	*handle = gem_create.out.handle;
+	return 0;
+}
+
+/*
+ * Create a buffer of `size` bytes with VRAM as the initial domain using
+ * DRM_IOCTL_RADEON_GEM_CREATE. On success stores the new handle in *handle
+ * and returns 0; otherwise returns the non-zero ioctl() result.
+ * The signature matches the `alloc` member of struct dri, which takes the
+ * size as uint64_t.
+ */
+static int radeon_alloc(struct dri *dri, uint64_t size, uint32_t *handle)
+{
+	struct drm_radeon_gem_create gem_create = {0};
+	int err;
+
+	gem_create.size = size;
+	gem_create.initial_domain = RADEON_GEM_DOMAIN_VRAM;
+	err = ioctl(dri->fd, DRM_IOCTL_RADEON_GEM_CREATE, &gem_create);
+	if (err)
+		return err;
+
+	*handle = gem_create.handle;
+	return 0;
+}
+
+/*
+ * Query the mmap offset of buffer `handle` with DRM_IOCTL_I915_GEM_MMAP_GTT.
+ * On success stores it in *offset and returns 0; otherwise returns the
+ * non-zero ioctl() result.
+ */
+static int i915_mmap_offset(struct dri *dri, uint32_t handle, uint64_t *offset)
+{
+	struct drm_i915_gem_mmap_gtt gem_mmap = {0};
+	int err;
+
+	gem_mmap.handle = handle;
+	err = ioctl(dri->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &gem_mmap);
+	if (err)
+		return err;
+
+	*offset = gem_mmap.offset;
+	return 0;
+}
+
+/*
+ * Query the mmap offset of buffer `handle` with DRM_IOCTL_AMDGPU_GEM_MMAP.
+ * On success stores the returned addr_ptr in *offset and returns 0;
+ * otherwise returns the non-zero ioctl() result.
+ */
+static int amdgpu_mmap_offset(struct dri *dri, uint32_t handle,
+			      uint64_t *offset)
+{
+	union drm_amdgpu_gem_mmap gem_mmap = {{0}};
+	int err;
+
+	gem_mmap.in.handle = handle;
+	err = ioctl(dri->fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &gem_mmap);
+	if (err)
+		return err;
+
+	*offset = gem_mmap.out.addr_ptr;
+	return 0;
+}
+
+/*
+ * Query the mmap offset of buffer `handle` with DRM_IOCTL_RADEON_GEM_MMAP.
+ * On success stores the returned addr_ptr in *offset and returns 0;
+ * otherwise returns the non-zero ioctl() result.
+ */
+static int radeon_mmap_offset(struct dri *dri, uint32_t handle,
+			      uint64_t *offset)
+{
+	struct drm_radeon_gem_mmap gem_mmap = {0};
+	int err;
+
+	gem_mmap.handle = handle;
+	err = ioctl(dri->fd, DRM_IOCTL_RADEON_GEM_MMAP, &gem_mmap);
+	if (err)
+		return err;
+
+	*offset = gem_mmap.addr_ptr;
+	return 0;
+}
+
+/*
+ * Open the render node for GPU `unit` (/dev/dri/renderD<128 + unit>) and pick
+ * the allocation and mmap-offset helpers that match the driver name reported
+ * by DRM_IOCTL_VERSION. Only amdgpu, i915 and radeon are recognised.
+ * Returns a heap-allocated handle to release with dri_close(), or NULL on
+ * any failure.
+ */
+static struct dri *dri_open(int unit)
+{
+	char path[32];
+	struct drm_version version = {0};
+	char name[16] = {0};
+	int err;
+	struct dri *dri;
+
+	dri = malloc(sizeof(*dri));
+	if (!dri)
+		return NULL;
+
+	/* Bounded formatting: never write past the end of path[] */
+	snprintf(path, sizeof(path), "/dev/dri/renderD%d", unit + 128);
+
+	dri->fd = open(path, O_RDWR);
+	if (dri->fd < 0)
+		goto out_free;
+
+	/*
+	 * Offer one byte less than the buffer so that, as long as the ioctl
+	 * writes at most name_len bytes, the zero-initialised last byte
+	 * terminates the string handed to strcmp() below.
+	 */
+	version.name = name;
+	version.name_len = sizeof(name) - 1;
+	err = ioctl(dri->fd, DRM_IOCTL_VERSION, &version);
+	if (err)
+		goto out_close;
+
+	if (!strcmp(name, "amdgpu")) {
+		dri->alloc = amdgpu_alloc;
+		dri->mmap_offset = amdgpu_mmap_offset;
+	} else if (!strcmp(name, "i915")) {
+		dri->alloc = i915_alloc;
+		dri->mmap_offset = i915_mmap_offset;
+	} else if (!strcmp(name, "radeon")) {
+		dri->alloc = radeon_alloc;
+		dri->mmap_offset = radeon_mmap_offset;
+	} else {
+		/* Unsupported driver */
+		goto out_close;
+	}
+	return dri;
+
+out_close:
+	close(dri->fd);
+
+out_free:
+	free(dri);
+	return NULL;
+}
+
+/*
+ * Close the device file descriptor (when it is valid) and free the handle.
+ * A NULL handle is ignored. The handle is freed even when its descriptor is
+ * invalid, so it cannot be leaked.
+ */
+static void dri_close(struct dri *dri)
+{
+	if (!dri)
+		return;
+
+	if (dri->fd >= 0)
+		close(dri->fd);
+	free(dri);
+}
+
+/* Release buffer `handle` on this device via DRM_IOCTL_GEM_CLOSE. */
+static void dri_free_buf(struct dri *dri, uint32_t handle)
+{
+	struct drm_gem_close req = { .handle = handle };
+
+	ioctl(dri->fd, DRM_IOCTL_GEM_CLOSE, &req);
+}
+
+/*
+ * Allocate a buffer of `size` bytes through the driver-specific helper and
+ * export it as a file descriptor with DRM_IOCTL_PRIME_HANDLE_TO_FD.
+ * On success stores the buffer handle in *handle and the exported descriptor
+ * in *fd and returns 0. Returns -EINVAL for an unusable device, or the
+ * non-zero result of the failing step; if the export fails the buffer is
+ * released again before returning.
+ * `size` is uint64_t so the value passed by dmabuf_alloc() reaches
+ * dri->alloc() without narrowing.
+ */
+static int dri_alloc_buf(struct dri *dri, uint64_t size, uint32_t *handle, int *fd)
+{
+	struct drm_prime_handle prime_handle = {0};
+	int err;
+
+	if (!dri || dri->fd < 0)
+		return -EINVAL;
+
+	err = dri->alloc(dri, size, handle);
+	if (err)
+		return err;
+
+	prime_handle.handle = *handle;
+	prime_handle.flags = O_RDWR;
+	err = ioctl(dri->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle);
+	if (err) {
+		dri_free_buf(dri, *handle);
+		return err;
+	}
+
+	*fd = prime_handle.fd;
+	return 0;
+}
+
+/*
+ * Look up the mmap offset of buffer `handle` through the driver-specific
+ * helper. Returns -EINVAL for an unusable device, otherwise the helper's
+ * result.
+ */
+static int dri_map_buf(struct dri *dri, uint32_t handle, uint64_t *offset)
+{
+	if (!dri || dri->fd < 0)
+		return -EINVAL;
+
+	return dri->mmap_offset(dri, handle, offset);
+}
+
+/*
+ * Abstraction of dmabuf object, allocated using the DRI abstraction defined
+ * above.
+ */
+
+struct dmabuf {
+	/* DRI device the buffer was allocated from */
+	struct dri *dri;
+	/* Descriptor obtained from DRM_IOCTL_PRIME_HANDLE_TO_FD */
+	int fd;
+	/* Buffer handle returned by the driver's alloc helper */
+	uint32_t handle;
+	/* Offset returned by the driver's mmap_offset helper */
+	uint64_t map_offset;
+};
+
+/*
+ * Allocate a buffer of `size` bytes on GPU `unit`, export it as a file
+ * descriptor and look up its mmap offset.
+ * Returns a heap-allocated object to release with dmabuf_free(), or NULL on
+ * failure; on failure every resource acquired so far is released again.
+ */
+struct dmabuf *dmabuf_alloc(uint64_t size, int unit)
+{
+	struct dmabuf *dmabuf;
+	int err;
+
+	dmabuf = malloc(sizeof *dmabuf);
+	if (!dmabuf)
+		return NULL;
+
+	dmabuf->dri = dri_open(unit);
+	if (!dmabuf->dri)
+		goto out_free;
+
+	err = dri_alloc_buf(dmabuf->dri, size, &dmabuf->handle, &dmabuf->fd);
+	if (err)
+		goto out_close;
+
+	err = dri_map_buf(dmabuf->dri, dmabuf->handle, &dmabuf->map_offset);
+	if (err)
+		goto out_free_buf;
+
+	return dmabuf;
+
+out_free_buf:
+	/* dri_alloc_buf() succeeded, so the exported descriptor is open too */
+	close(dmabuf->fd);
+	dri_free_buf(dmabuf->dri, dmabuf->handle);
+
+out_close:
+	dri_close(dmabuf->dri);
+
+out_free:
+	free(dmabuf);
+	return NULL;
+}
+
+/*
+ * Release everything owned by `dmabuf`: the exported descriptor, the buffer
+ * handle, the DRI device and the object itself. A NULL pointer is ignored.
+ */
+void dmabuf_free(struct dmabuf *dmabuf)
+{
+	if (!dmabuf)
+		return;
+
+	close(dmabuf->fd);
+	dri_free_buf(dmabuf->dri, dmabuf->handle);
+	dri_close(dmabuf->dri);
+	free(dmabuf);
+}
+
+/* Return the descriptor of the DRI device, or -1 if there is none. */
+int dmabuf_get_dri_fd(struct dmabuf *dmabuf)
+{
+	if (!dmabuf || !dmabuf->dri)
+		return -1;
+
+	return dmabuf->dri->fd;
+}
+
+/* Return the exported buffer descriptor, or -1 for a NULL object. */
+int dmabuf_get_fd(struct dmabuf *dmabuf)
+{
+	if (!dmabuf)
+		return -1;
+
+	return dmabuf->fd;
+}
+
+/*
+ * Return the stored mmap offset. For a NULL object the result is -1
+ * converted to uint64_t, i.e. UINT64_MAX.
+ */
+uint64_t dmabuf_get_offset(struct dmabuf *dmabuf)
+{
+	if (!dmabuf)
+		return -1;
+
+	return dmabuf->map_offset;
+}
+
diff --git a/pyverbs/dmabuf_alloc.h b/pyverbs/dmabuf_alloc.h
new file mode 100644
index 0000000..f36c337
--- /dev/null
+++ b/pyverbs/dmabuf_alloc.h
@@ -0,0 +1,19 @@ 
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright 2020 Intel Corporation. All rights reserved. See COPYING file
+ */
+
+#ifndef _DRI_H_
+#define _DRI_H_
+
+#include <stdint.h>
+
+struct dmabuf;
+
+struct dmabuf *dmabuf_alloc(uint64_t size, int unit);
+void dmabuf_free(struct dmabuf *dmabuf);
+int dmabuf_get_dri_fd(struct dmabuf *dmabuf);
+int dmabuf_get_fd(struct dmabuf *dmabuf);
+uint64_t dmabuf_get_offset(struct dmabuf *dmabuf);
+
+#endif /* _DRI_H_ */
diff --git a/pyverbs/libibverbs.pxd b/pyverbs/libibverbs.pxd
index 6fbba54..d76f633 100644
--- a/pyverbs/libibverbs.pxd
+++ b/pyverbs/libibverbs.pxd
@@ -507,6 +507,8 @@  cdef extern from 'infiniband/verbs.h':
     ibv_pd *ibv_alloc_pd(ibv_context *context)
     int ibv_dealloc_pd(ibv_pd *pd)
     ibv_mr *ibv_reg_mr(ibv_pd *pd, void *addr, size_t length, int access)
+    ibv_mr *ibv_reg_dmabuf_mr(ibv_pd *pd, uint64_t offset, size_t length,
+                              uint64_t iova, int fd, int access)
     int ibv_dereg_mr(ibv_mr *mr)
     int ibv_advise_mr(ibv_pd *pd, uint32_t advice, uint32_t flags,
                       ibv_sge *sg_list, uint32_t num_sge)
diff --git a/pyverbs/mr.pxd b/pyverbs/mr.pxd
index ebe8ada..d9a79ff 100644
--- a/pyverbs/mr.pxd
+++ b/pyverbs/mr.pxd
@@ -1,5 +1,6 @@ 
 # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
 # Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
 
 #cython: language_level=3
 
@@ -33,3 +34,8 @@  cdef class MW(PyverbsCM):
 
 cdef class DMMR(MR):
     cdef object dm
+
+cdef class DmaBufMR(MR):
+    cdef object dmabuf
+    cdef unsigned long offset
+    cdef object is_dmabuf_internal
diff --git a/pyverbs/mr.pyx b/pyverbs/mr.pyx
index 7011da1..e4ed2dc 100644
--- a/pyverbs/mr.pyx
+++ b/pyverbs/mr.pyx
@@ -1,11 +1,12 @@ 
 # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
 # Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+# Copyright (c) 2020, Intel Corporation. All rights reserved. See COPYING file
 
 import resource
 import logging
 
 from posix.mman cimport mmap, munmap, MAP_PRIVATE, PROT_READ, PROT_WRITE, \
-    MAP_ANONYMOUS, MAP_HUGETLB
+    MAP_ANONYMOUS, MAP_HUGETLB, MAP_SHARED
 from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError, \
     PyverbsUserError
 from libc.stdint cimport uintptr_t, SIZE_MAX
@@ -14,9 +15,10 @@  from posix.stdlib cimport posix_memalign
 from libc.string cimport memcpy, memset
 cimport pyverbs.libibverbs_enums as e
 from pyverbs.device cimport DM
-from libc.stdlib cimport free
+from libc.stdlib cimport free, malloc
 from .cmid cimport CMID
 from .pd cimport PD
+from .dmabuf cimport DmaBuf
 
 cdef extern from 'sys/mman.h':
     cdef void* MAP_FAILED
@@ -348,6 +350,125 @@  cdef class DMMR(MR):
     cpdef read(self, length, offset):
         return self.dm.copy_from_dm(offset, length)
 
+cdef class DmaBufMR(MR):
+    """
+    Memory region registered over a dma-buf through ibv_reg_dmabuf_mr().
+    The dma-buf is either supplied by the caller or allocated, and later
+    closed, by this object.
+    """
+    def __init__(self, PD pd not None, length, access, DmaBuf dmabuf=None,
+                 offset=0):
+        """
+        Initializes a DmaBufMR (DMA-BUF Memory Region) of the given length
+        and access flags using the given PD and DmaBuf objects.
+        :param pd: A PD object
+        :param length: Length in bytes
+        :param access: Access flags, see ibv_access_flags enum
+        :param dmabuf: A DmaBuf object. One will be allocated if absent
+        :param offset: Byte offset from the beginning of the dma-buf
+        :return: The newly created DmaBufMR
+        """
+        self.logger = logging.getLogger(self.__class__.__name__)
+        # Record ownership explicitly so close() only closes a dma-buf that
+        # was allocated here.
+        self.is_dmabuf_internal = dmabuf is None
+        if self.is_dmabuf_internal:
+            dmabuf = DmaBuf(length + offset)
+        # The offset is passed twice: as the dma-buf offset and as the iova.
+        self.mr = v.ibv_reg_dmabuf_mr(pd.pd, offset, length, offset, dmabuf.fd, access)
+        if self.mr == NULL:
+            raise PyverbsRDMAErrno(f'Failed to register a dma-buf MR. length: {length}, access flags: {access}')
+        super().__init__(pd, length, access)
+        self.pd = pd
+        self.dmabuf = dmabuf
+        self.offset = offset
+        pd.add_ref(self)
+        dmabuf.add_ref(self)
+        self.logger.debug(f'Registered dma-buf ibv_mr. Length: {length}, access flags {access}')
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        """
+        Deregisters the underlying ibv_mr and, if the dma-buf was allocated
+        by this object, closes the dma-buf as well.
+        :return: None
+        """
+        if self.mr != NULL:
+            self.logger.debug('Closing dma-buf MR')
+            rc = v.ibv_dereg_mr(self.mr)
+            if rc != 0:
+                raise PyverbsRDMAError('Failed to dereg dma-buf MR', rc)
+            self.pd = None
+            self.mr = NULL
+            # Set self.mr to NULL before closing dmabuf because this method is
+            # re-entered when close_weakrefs() is called inside dmabuf.close().
+            if self.is_dmabuf_internal:
+                self.dmabuf.close()
+            self.dmabuf = None
+
+    @property
+    def offset(self):
+        """Byte offset of the MR from the beginning of the dma-buf."""
+        return self.offset
+
+    @property
+    def dmabuf(self):
+        """The DmaBuf object backing the MR; None after close()."""
+        return self.dmabuf
+
+    def write(self, data, length, offset=0):
+        """
+        Write user data to the dma-buf backing the MR
+        :param data: User data to write; a str is encoded to bytes first
+        :param length: Length of the data to write
+        :param offset: Writing offset, added to the MR's own dma-buf offset
+        :return: None
+        """
+        if isinstance(data, str):
+            data = data.encode()
+        if length > len(data):
+            # memcpy() below would otherwise read past the end of data.
+            raise PyverbsUserError(f'Requested to write {length} bytes but data holds only {len(data)}')
+        # size_t instead of int: offsets of 2GB and above must not wrap, and
+        # negative values are rejected by the conversion.
+        cdef size_t nbytes = length
+        cdef size_t off = offset + self.offset
+        cdef size_t map_len = nbytes + off
+        # Take the source pointer before mapping so a conversion error cannot
+        # leave the mapping behind.
+        cdef char *src = <char *>data
+        cdef void *buf = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
+                              MAP_SHARED, self.dmabuf.dri_fd,
+                              self.dmabuf.map_offset)
+        if buf == MAP_FAILED:
+            raise PyverbsError(f'Failed to map dma-buf of size {map_len}')
+        memcpy(<char*>buf + off, src, nbytes)
+        munmap(buf, map_len)
+
+    cpdef read(self, length, offset):
+        """
+        Reads data from the dma-buf backing the MR
+        :param length: Length of data to read
+        :param offset: Reading offset, added to the MR's own dma-buf offset
+        :return: The data on the buffer in the requested offset
+        """
+        cdef size_t nbytes = length
+        cdef size_t off = offset + self.offset
+        cdef size_t map_len = nbytes + off
+        cdef void *buf = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
+                              MAP_SHARED, self.dmabuf.dri_fd,
+                              self.dmabuf.map_offset)
+        if buf == MAP_FAILED:
+            raise PyverbsError(f'Failed to map dma-buf of size {map_len}')
+        try:
+            # Slicing a char* copies into a new bytes object, so no malloc'ed
+            # staging buffer (whose allocation was never checked) is needed.
+            return (<char*>buf + off)[:nbytes]
+        finally:
+            munmap(buf, map_len)
+
 
 def mwtype2str(mw_type):
     mw_types = {1:'IBV_MW_TYPE_1', 2:'IBV_MW_TYPE_2'}