diff mbox series

[v2,1/2] migration/rdma: Try to register On-Demand Paging memory region

Message ID 20210823033358.3002-2-lizhijian@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show
Series enable fsdax rdma migration | expand

Commit Message

Li Zhijian Aug. 23, 2021, 3:33 a.m. UTC
Previously, registering an fsdax-backed memory-backend-file failed with
"Operation not supported". In this case, we can try to register it with
On-Demand Paging[1], as rpma_mr_reg() does in rpma[2].

[1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
[2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3

CC: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>

---
V2: add ODP sanity check and remove goto
---
 migration/rdma.c       | 73 ++++++++++++++++++++++++++++++------------
 migration/trace-events |  1 +
 2 files changed, 54 insertions(+), 20 deletions(-)

Comments

Zhijian Li (Fujitsu) Aug. 23, 2021, 8:42 a.m. UTC | #1
CCing  Marcel


On 23/08/2021 11:33, Li Zhijian wrote:
> Previously, for the fsdax mem-backend-file, it will register failed with
> Operation not supported. In this case, we can try to register it with
> On-Demand Paging[1] like what rpma_mr_reg() does on rpma[2].
>
> [1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
> [2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
>
> CC: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>
> ---
> V2: add ODP sanity check and remove goto
> ---
>   migration/rdma.c       | 73 ++++++++++++++++++++++++++++++------------
>   migration/trace-events |  1 +
>   2 files changed, 54 insertions(+), 20 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 5c2d113aa94..eb80431aae2 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -1117,19 +1117,47 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
>       return 0;
>   }
>   
> +/* Check whether On-Demand Paging is supported by RDAM device */
> +static bool rdma_support_odp(struct ibv_context *dev)
> +{
> +    struct ibv_device_attr_ex attr = {0};
> +    int ret = ibv_query_device_ex(dev, NULL, &attr);
> +    if (ret) {
> +        return false;
> +    }
> +
> +    if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
> +        return true;
> +    }
> +
> +    return false;
> +}
> +
>   static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
>   {
>       int i;
>       RDMALocalBlocks *local = &rdma->local_ram_blocks;
>   
>       for (i = 0; i < local->nb_blocks; i++) {
> +        int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
> +
>           local->block[i].mr =
>               ibv_reg_mr(rdma->pd,
>                       local->block[i].local_host_addr,
> -                    local->block[i].length,
> -                    IBV_ACCESS_LOCAL_WRITE |
> -                    IBV_ACCESS_REMOTE_WRITE
> +                    local->block[i].length, access
>                       );
> +
> +        if (!local->block[i].mr &&
> +            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
> +                access |= IBV_ACCESS_ON_DEMAND;
> +                /* register ODP mr */
> +                local->block[i].mr =
> +                    ibv_reg_mr(rdma->pd,
> +                               local->block[i].local_host_addr,
> +                               local->block[i].length, access);
> +                trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
> +        }
> +
>           if (!local->block[i].mr) {
>               perror("Failed to register local dest ram block!");
>               break;
> @@ -1215,28 +1243,33 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
>        */
>       if (!block->pmr[chunk]) {
>           uint64_t len = chunk_end - chunk_start;
> +        int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
> +                     0;
>   
>           trace_qemu_rdma_register_and_get_keys(len, chunk_start);
>   
> -        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
> -                chunk_start, len,
> -                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
> -                        IBV_ACCESS_REMOTE_WRITE) : 0));
> -
> -        if (!block->pmr[chunk]) {
> -            perror("Failed to register chunk!");
> -            fprintf(stderr, "Chunk details: block: %d chunk index %d"
> -                            " start %" PRIuPTR " end %" PRIuPTR
> -                            " host %" PRIuPTR
> -                            " local %" PRIuPTR " registrations: %d\n",
> -                            block->index, chunk, (uintptr_t)chunk_start,
> -                            (uintptr_t)chunk_end, host_addr,
> -                            (uintptr_t)block->local_host_addr,
> -                            rdma->total_registrations);
> -            return -1;
> +        block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> +        if (!block->pmr[chunk] &&
> +            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
> +            access |= IBV_ACCESS_ON_DEMAND;
> +            /* register ODP mr */
> +            block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> +            trace_qemu_rdma_register_odp_mr(block->block_name);
>           }
> -        rdma->total_registrations++;
>       }
> +    if (!block->pmr[chunk]) {
> +        perror("Failed to register chunk!");
> +        fprintf(stderr, "Chunk details: block: %d chunk index %d"
> +                        " start %" PRIuPTR " end %" PRIuPTR
> +                        " host %" PRIuPTR
> +                        " local %" PRIuPTR " registrations: %d\n",
> +                        block->index, chunk, (uintptr_t)chunk_start,
> +                        (uintptr_t)chunk_end, host_addr,
> +                        (uintptr_t)block->local_host_addr,
> +                        rdma->total_registrations);
> +        return -1;
> +    }
> +    rdma->total_registrations++;
>   
>       if (lkey) {
>           *lkey = block->pmr[chunk]->lkey;
> diff --git a/migration/trace-events b/migration/trace-events
> index a1c0f034ab8..5f6aa580def 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
>   qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
>   qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
>   qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
> +qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
>   qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
>   qemu_rdma_registration_handle_finished(void) ""
>   qemu_rdma_registration_handle_ram_blocks(void) ""
Marcel Apfelbaum Aug. 23, 2021, 8:52 a.m. UTC | #2
Hi Zhijian,

On Mon, Aug 23, 2021 at 11:42 AM lizhijian@fujitsu.com
<lizhijian@fujitsu.com> wrote:
>
> CCing  Marcel
>
>
> On 23/08/2021 11:33, Li Zhijian wrote:
> > Previously, for the fsdax mem-backend-file, it will register failed with
> > Operation not supported. In this case, we can try to register it with
> > On-Demand Paging[1] like what rpma_mr_reg() does on rpma[2].
> >
> > [1]: https://community.mellanox.com/s/article/understanding-on-demand-paging--odp-x
> > [2]: http://pmem.io/rpma/manpages/v0.9.0/rpma_mr_reg.3
> >
> > CC: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
> > Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
> >
> > ---
> > V2: add ODP sanity check and remove goto
> > ---
> >   migration/rdma.c       | 73 ++++++++++++++++++++++++++++++------------
> >   migration/trace-events |  1 +
> >   2 files changed, 54 insertions(+), 20 deletions(-)
> >
> > diff --git a/migration/rdma.c b/migration/rdma.c
> > index 5c2d113aa94..eb80431aae2 100644
> > --- a/migration/rdma.c
> > +++ b/migration/rdma.c
> > @@ -1117,19 +1117,47 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
> >       return 0;
> >   }
> >
> > +/* Check whether On-Demand Paging is supported by RDAM device */
> > +static bool rdma_support_odp(struct ibv_context *dev)
> > +{
> > +    struct ibv_device_attr_ex attr = {0};
> > +    int ret = ibv_query_device_ex(dev, NULL, &attr);
> > +    if (ret) {
> > +        return false;
> > +    }
> > +
> > +    if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
> > +        return true;
> > +    }
> > +
> > +    return false;
> > +}
> > +
> >   static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
> >   {
> >       int i;
> >       RDMALocalBlocks *local = &rdma->local_ram_blocks;
> >
> >       for (i = 0; i < local->nb_blocks; i++) {
> > +        int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
> > +
> >           local->block[i].mr =
> >               ibv_reg_mr(rdma->pd,
> >                       local->block[i].local_host_addr,
> > -                    local->block[i].length,
> > -                    IBV_ACCESS_LOCAL_WRITE |
> > -                    IBV_ACCESS_REMOTE_WRITE
> > +                    local->block[i].length, access
> >                       );
> > +
> > +        if (!local->block[i].mr &&
> > +            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
> > +                access |= IBV_ACCESS_ON_DEMAND;
> > +                /* register ODP mr */
> > +                local->block[i].mr =
> > +                    ibv_reg_mr(rdma->pd,
> > +                               local->block[i].local_host_addr,
> > +                               local->block[i].length, access);
> > +                trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
> > +        }
> > +
> >           if (!local->block[i].mr) {
> >               perror("Failed to register local dest ram block!");
> >               break;
> > @@ -1215,28 +1243,33 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
> >        */
> >       if (!block->pmr[chunk]) {
> >           uint64_t len = chunk_end - chunk_start;
> > +        int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
> > +                     0;
> >
> >           trace_qemu_rdma_register_and_get_keys(len, chunk_start);
> >
> > -        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
> > -                chunk_start, len,
> > -                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
> > -                        IBV_ACCESS_REMOTE_WRITE) : 0));
> > -
> > -        if (!block->pmr[chunk]) {
> > -            perror("Failed to register chunk!");
> > -            fprintf(stderr, "Chunk details: block: %d chunk index %d"
> > -                            " start %" PRIuPTR " end %" PRIuPTR
> > -                            " host %" PRIuPTR
> > -                            " local %" PRIuPTR " registrations: %d\n",
> > -                            block->index, chunk, (uintptr_t)chunk_start,
> > -                            (uintptr_t)chunk_end, host_addr,
> > -                            (uintptr_t)block->local_host_addr,
> > -                            rdma->total_registrations);
> > -            return -1;
> > +        block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> > +        if (!block->pmr[chunk] &&
> > +            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
> > +            access |= IBV_ACCESS_ON_DEMAND;
> > +            /* register ODP mr */
> > +            block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> > +            trace_qemu_rdma_register_odp_mr(block->block_name);
> >           }
> > -        rdma->total_registrations++;
> >       }
> > +    if (!block->pmr[chunk]) {
> > +        perror("Failed to register chunk!");
> > +        fprintf(stderr, "Chunk details: block: %d chunk index %d"
> > +                        " start %" PRIuPTR " end %" PRIuPTR
> > +                        " host %" PRIuPTR
> > +                        " local %" PRIuPTR " registrations: %d\n",
> > +                        block->index, chunk, (uintptr_t)chunk_start,
> > +                        (uintptr_t)chunk_end, host_addr,
> > +                        (uintptr_t)block->local_host_addr,
> > +                        rdma->total_registrations);
> > +        return -1;
> > +    }
> > +    rdma->total_registrations++;
> >
> >       if (lkey) {
> >           *lkey = block->pmr[chunk]->lkey;
> > diff --git a/migration/trace-events b/migration/trace-events
> > index a1c0f034ab8..5f6aa580def 100644
> > --- a/migration/trace-events
> > +++ b/migration/trace-events
> > @@ -212,6 +212,7 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
> >   qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
> >   qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
> >   qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
> > +qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
> >   qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
> >   qemu_rdma_registration_handle_finished(void) ""
> >   qemu_rdma_registration_handle_ram_blocks(void) ""

Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>

Thanks,
Marcel
diff mbox series

Patch

diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113aa94..eb80431aae2 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1117,19 +1117,47 @@  static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     return 0;
 }
 
+/* Check whether On-Demand Paging is supported by the RDMA device */
+static bool rdma_support_odp(struct ibv_context *dev)
+{
+    struct ibv_device_attr_ex attr = {0};
+    int ret = ibv_query_device_ex(dev, NULL, &attr);
+    if (ret) {
+        return false;
+    }
+
+    if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
+        return true;
+    }
+
+    return false;
+}
+
 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 {
     int i;
     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 
     for (i = 0; i < local->nb_blocks; i++) {
+        int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
         local->block[i].mr =
             ibv_reg_mr(rdma->pd,
                     local->block[i].local_host_addr,
-                    local->block[i].length,
-                    IBV_ACCESS_LOCAL_WRITE |
-                    IBV_ACCESS_REMOTE_WRITE
+                    local->block[i].length, access
                     );
+
+        if (!local->block[i].mr &&
+            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+                access |= IBV_ACCESS_ON_DEMAND;
+                /* register ODP mr */
+                local->block[i].mr =
+                    ibv_reg_mr(rdma->pd,
+                               local->block[i].local_host_addr,
+                               local->block[i].length, access);
+                trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+        }
+
         if (!local->block[i].mr) {
             perror("Failed to register local dest ram block!");
             break;
@@ -1215,28 +1243,33 @@  static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
      */
     if (!block->pmr[chunk]) {
         uint64_t len = chunk_end - chunk_start;
+        int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
+                     0;
 
         trace_qemu_rdma_register_and_get_keys(len, chunk_start);
 
-        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
-                chunk_start, len,
-                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
-                        IBV_ACCESS_REMOTE_WRITE) : 0));
-
-        if (!block->pmr[chunk]) {
-            perror("Failed to register chunk!");
-            fprintf(stderr, "Chunk details: block: %d chunk index %d"
-                            " start %" PRIuPTR " end %" PRIuPTR
-                            " host %" PRIuPTR
-                            " local %" PRIuPTR " registrations: %d\n",
-                            block->index, chunk, (uintptr_t)chunk_start,
-                            (uintptr_t)chunk_end, host_addr,
-                            (uintptr_t)block->local_host_addr,
-                            rdma->total_registrations);
-            return -1;
+        block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+        if (!block->pmr[chunk] &&
+            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+            access |= IBV_ACCESS_ON_DEMAND;
+            /* register ODP mr */
+            block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+            trace_qemu_rdma_register_odp_mr(block->block_name);
         }
-        rdma->total_registrations++;
     }
+    if (!block->pmr[chunk]) {
+        perror("Failed to register chunk!");
+        fprintf(stderr, "Chunk details: block: %d chunk index %d"
+                        " start %" PRIuPTR " end %" PRIuPTR
+                        " host %" PRIuPTR
+                        " local %" PRIuPTR " registrations: %d\n",
+                        block->index, chunk, (uintptr_t)chunk_start,
+                        (uintptr_t)chunk_end, host_addr,
+                        (uintptr_t)block->local_host_addr,
+                        rdma->total_registrations);
+        return -1;
+    }
+    rdma->total_registrations++;
 
     if (lkey) {
         *lkey = block->pmr[chunk]->lkey;
diff --git a/migration/trace-events b/migration/trace-events
index a1c0f034ab8..5f6aa580def 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -212,6 +212,7 @@  qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
 qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
 qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
 qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
+qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
 qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
 qemu_rdma_registration_handle_finished(void) ""
 qemu_rdma_registration_handle_ram_blocks(void) ""