Message ID | 20210809175620.720923-13-ltykernel@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | x86/Hyper-V: Add Hyper-V Isolation VM support | expand |
From: Tianyu Lan <ltykernel@gmail.com> Sent: Monday, August 9, 2021 10:56 AM > The Subject line tag should be "hv_netvsc:". > In Isolation VM, all shared memory with host needs to mark visible > to host via hvcall. vmbus_establish_gpadl() has already done it for > netvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_ > pagebuffer() still need to handle. Use DMA API to map/umap these > memory during sending/receiving packet and Hyper-V DMA ops callback > will use swiotlb function to allocate bounce buffer and copy data > from/to bounce buffer. > > Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com> > --- > drivers/net/hyperv/hyperv_net.h | 6 ++ > drivers/net/hyperv/netvsc.c | 144 +++++++++++++++++++++++++++++- > drivers/net/hyperv/rndis_filter.c | 2 + > include/linux/hyperv.h | 5 ++ > 4 files changed, 154 insertions(+), 3 deletions(-) > > diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h > index bc48855dff10..862419912bfb 100644 > --- a/drivers/net/hyperv/hyperv_net.h > +++ b/drivers/net/hyperv/hyperv_net.h > @@ -164,6 +164,7 @@ struct hv_netvsc_packet { > u32 total_bytes; > u32 send_buf_index; > u32 total_data_buflen; > + struct hv_dma_range *dma_range; > }; > > #define NETVSC_HASH_KEYLEN 40 > @@ -1074,6 +1075,7 @@ struct netvsc_device { > > /* Receive buffer allocated by us but manages by NetVSP */ > void *recv_buf; > + void *recv_original_buf; > u32 recv_buf_size; /* allocated bytes */ > u32 recv_buf_gpadl_handle; > u32 recv_section_cnt; > @@ -1082,6 +1084,8 @@ struct netvsc_device { > > /* Send buffer allocated by us */ > void *send_buf; > + void *send_original_buf; > + u32 send_buf_size; > u32 send_buf_gpadl_handle; > u32 send_section_cnt; > u32 send_section_size; > @@ -1730,4 +1734,6 @@ struct rndis_message { > #define RETRY_US_HI 10000 > #define RETRY_MAX 2000 /* >10 sec */ > > +void netvsc_dma_unmap(struct hv_device *hv_dev, > + struct hv_netvsc_packet *packet); > #endif /* _HYPERV_NET_H */ > diff --git 
a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c > index 7bd935412853..fc312e5db4d5 100644 > --- a/drivers/net/hyperv/netvsc.c > +++ b/drivers/net/hyperv/netvsc.c > @@ -153,8 +153,21 @@ static void free_netvsc_device(struct rcu_head *head) > int i; > > kfree(nvdev->extension); > - vfree(nvdev->recv_buf); > - vfree(nvdev->send_buf); > + > + if (nvdev->recv_original_buf) { > + vunmap(nvdev->recv_buf); > + vfree(nvdev->recv_original_buf); > + } else { > + vfree(nvdev->recv_buf); > + } > + > + if (nvdev->send_original_buf) { > + vunmap(nvdev->send_buf); > + vfree(nvdev->send_original_buf); > + } else { > + vfree(nvdev->send_buf); > + } > + > kfree(nvdev->send_section_map); > > for (i = 0; i < VRSS_CHANNEL_MAX; i++) { > @@ -330,6 +343,27 @@ int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx) > return nvchan->mrc.slots ? 0 : -ENOMEM; > } > > +static void *netvsc_remap_buf(void *buf, unsigned long size) > +{ > + unsigned long *pfns; > + void *vaddr; > + int i; > + > + pfns = kcalloc(size / HV_HYP_PAGE_SIZE, sizeof(unsigned long), > + GFP_KERNEL); This assumes that the "size" argument is a multiple of PAGE_SIZE. I think that's true in all the use cases, but it would be safer to check. > + if (!pfns) > + return NULL; > + > + for (i = 0; i < size / HV_HYP_PAGE_SIZE; i++) > + pfns[i] = virt_to_hvpfn(buf + i * HV_HYP_PAGE_SIZE) > + + (ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT); > + > + vaddr = vmap_pfn(pfns, size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO); > + kfree(pfns); > + > + return vaddr; > +} This function appears to be a duplicate of hv_map_memory() in Patch 11 of this series. Is it possible to structure things so there is only one implementation? In any case, see the comment in hv_map_memory() about PAGE_SIZE vs HV_HYP_PAGE_SIZE and similar. 
> + > static int netvsc_init_buf(struct hv_device *device, > struct netvsc_device *net_device, > const struct netvsc_device_info *device_info) > @@ -340,6 +374,7 @@ static int netvsc_init_buf(struct hv_device *device, > unsigned int buf_size; > size_t map_words; > int i, ret = 0; > + void *vaddr; > > /* Get receive buffer area. */ > buf_size = device_info->recv_sections * device_info->recv_section_size; > @@ -375,6 +410,15 @@ static int netvsc_init_buf(struct hv_device *device, > goto cleanup; > } > > + if (hv_isolation_type_snp()) { > + vaddr = netvsc_remap_buf(net_device->recv_buf, buf_size); > + if (!vaddr) > + goto cleanup; > + > + net_device->recv_original_buf = net_device->recv_buf; > + net_device->recv_buf = vaddr; > + } > + > /* Notify the NetVsp of the gpadl handle */ > init_packet = &net_device->channel_init_pkt; > memset(init_packet, 0, sizeof(struct nvsp_message)); > @@ -477,6 +521,15 @@ static int netvsc_init_buf(struct hv_device *device, > goto cleanup; > } > > + if (hv_isolation_type_snp()) { > + vaddr = netvsc_remap_buf(net_device->send_buf, buf_size); > + if (!vaddr) > + goto cleanup; I don't think this error case is handled correctly. Doesn't the remapping of the recv buf need to be undone? 
> + > + net_device->send_original_buf = net_device->send_buf; > + net_device->send_buf = vaddr; > + } > + > /* Notify the NetVsp of the gpadl handle */ > init_packet = &net_device->channel_init_pkt; > memset(init_packet, 0, sizeof(struct nvsp_message)); > @@ -767,7 +820,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev, > > /* Notify the layer above us */ > if (likely(skb)) { > - const struct hv_netvsc_packet *packet > + struct hv_netvsc_packet *packet > = (struct hv_netvsc_packet *)skb->cb; > u32 send_index = packet->send_buf_index; > struct netvsc_stats *tx_stats; > @@ -783,6 +836,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev, > tx_stats->bytes += packet->total_bytes; > u64_stats_update_end(&tx_stats->syncp); > > + netvsc_dma_unmap(ndev_ctx->device_ctx, packet); > napi_consume_skb(skb, budget); > } > > @@ -947,6 +1001,82 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, > memset(dest, 0, padding); > } > > +void netvsc_dma_unmap(struct hv_device *hv_dev, > + struct hv_netvsc_packet *packet) > +{ > + u32 page_count = packet->cp_partial ? > + packet->page_buf_cnt - packet->rmsg_pgcnt : > + packet->page_buf_cnt; > + int i; > + > + if (!hv_is_isolation_supported()) > + return; > + > + if (!packet->dma_range) > + return; > + > + for (i = 0; i < page_count; i++) > + dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma, > + packet->dma_range[i].mapping_size, > + DMA_TO_DEVICE); > + > + kfree(packet->dma_range); > +} > + > +/* netvsc_dma_map - Map swiotlb bounce buffer with data page of > + * packet sent by vmbus_sendpacket_pagebuffer() in the Isolation > + * VM. > + * > + * In isolation VM, netvsc send buffer has been marked visible to > + * host and so the data copied to send buffer doesn't need to use > + * bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer() > + * may not be copied to send buffer and so these pages need to be > + * mapped with swiotlb bounce buffer. 
netvsc_dma_map() is to do > + * that. The pfns in the struct hv_page_buffer need to be converted > + * to bounce buffer's pfn. The loop here is necessary and so not > + * use dma_map_sg() here. I think I understand why the loop is necessary, but it would be nice to add a bit more comment text to explain. The reason is that the entries in the page buffer array are not necessarily full pages of data. Each entry in the array has a separate offset and len that may be non-zero, even for entries in the middle of the array. And the entries are not physically contiguous. So each entry must be individually mapped rather than as a contiguous unit. > + */ > +int netvsc_dma_map(struct hv_device *hv_dev, > + struct hv_netvsc_packet *packet, > + struct hv_page_buffer *pb) > +{ > + u32 page_count = packet->cp_partial ? > + packet->page_buf_cnt - packet->rmsg_pgcnt : > + packet->page_buf_cnt; > + dma_addr_t dma; > + int i; > + > + if (!hv_is_isolation_supported()) > + return 0; > + > + packet->dma_range = kcalloc(page_count, > + sizeof(*packet->dma_range), > + GFP_KERNEL); > + if (!packet->dma_range) > + return -ENOMEM; > + > + for (i = 0; i < page_count; i++) { > + char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT) > + + pb[i].offset); > + u32 len = pb[i].len; > + > + dma = dma_map_single(&hv_dev->device, src, len, > + DMA_TO_DEVICE); > + if (dma_mapping_error(&hv_dev->device, dma)) { > + kfree(packet->dma_range); > + return -ENOMEM; > + } > + > + packet->dma_range[i].dma = dma; > + packet->dma_range[i].mapping_size = len; > + pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT; > + pb[i].offset = offset_in_hvpage(dma); > + pb[i].len = len; > + } > + > + return 0; > +} > + > static inline int netvsc_send_pkt( > struct hv_device *device, > struct hv_netvsc_packet *packet, > @@ -987,14 +1117,22 @@ static inline int netvsc_send_pkt( > > trace_nvsp_send_pkt(ndev, out_channel, rpkt); > > + packet->dma_range = NULL; > if (packet->page_buf_cnt) { > if (packet->cp_partial) > pb += 
packet->rmsg_pgcnt; > > + ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb); > + if (ret) > + return ret; I think this error case needs to set things up so sending the packet can be retried at the higher levels. The typical error is that swiotlb is out of bounce buffer memory. That's a transient condition. There's already code in this function to retry when the vmbus_sendpacket functions fails because the ring buffer is full, and running out of bounce buffer memory should probably take the same path. > + > ret = vmbus_sendpacket_pagebuffer(out_channel, > pb, packet->page_buf_cnt, > &nvmsg, sizeof(nvmsg), > req_id); > + > + if (ret) > + netvsc_dma_unmap(ndev_ctx->device_ctx, packet); > } else { > ret = vmbus_sendpacket(out_channel, > &nvmsg, sizeof(nvmsg), > diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c > index f6c9c2a670f9..448fcc325ed7 100644 > --- a/drivers/net/hyperv/rndis_filter.c > +++ b/drivers/net/hyperv/rndis_filter.c > @@ -361,6 +361,8 @@ static void rndis_filter_receive_response(struct net_device *ndev, > } > } > > + netvsc_dma_unmap(((struct net_device_context *) > + netdev_priv(ndev))->device_ctx, &request->pkt); > complete(&request->wait_event); > } else { > netdev_err(ndev, > diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h > index 83fa567ad594..2ea638101645 100644 > --- a/include/linux/hyperv.h > +++ b/include/linux/hyperv.h > @@ -1601,6 +1601,11 @@ struct hyperv_service_callback { > void (*callback)(void *context); > }; > > +struct hv_dma_range { > + dma_addr_t dma; > + u32 mapping_size; > +}; > + > #define MAX_SRV_VER 0x7ffffff > extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen, > const int *fw_version, int fw_vercnt, > -- > 2.25.1
On Thu, Aug 19, 2021 at 06:14:51PM +0000, Michael Kelley wrote: > > + if (!pfns) > > + return NULL; > > + > > + for (i = 0; i < size / HV_HYP_PAGE_SIZE; i++) > > + pfns[i] = virt_to_hvpfn(buf + i * HV_HYP_PAGE_SIZE) > > + + (ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT); > > + > > + vaddr = vmap_pfn(pfns, size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO); > > + kfree(pfns); > > + > > + return vaddr; > > +} > > This function appears to be a duplicate of hv_map_memory() in Patch 11 of this > series. Is it possible to structure things so there is only one implementation? In So right now it is identical, but there is an important difference: the swiotlb memory is physically contiguous to start with, so we can do the simple remap using vmap_range as suggested in the last mail. The cases here are pretty weird in that netvsc_remap_buf is called right after vzalloc. That is, we create _two_ mappings in vmalloc space right after one another, where the original one is just used for establishing the "GPADL handle" and freeing the memory. In other words, the obvious thing to do here would be to use a vmalloc variant that allows taking the shared_gpa_boundary into account when setting up the PTEs. And here is something I need help with from the x86 experts: does the CPU actually care about this shared_gpa_boundary? Or does it just matter for the generated DMA address? Does someone have a good pointer to how this mechanism works?
On 8/20/2021 12:21 PM, hch@lst.de wrote: > On Thu, Aug 19, 2021 at 06:14:51PM +0000, Michael Kelley wrote: >>> + if (!pfns) >>> + return NULL; >>> + >>> + for (i = 0; i < size / HV_HYP_PAGE_SIZE; i++) >>> + pfns[i] = virt_to_hvpfn(buf + i * HV_HYP_PAGE_SIZE) >>> + + (ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT); >>> + >>> + vaddr = vmap_pfn(pfns, size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO); >>> + kfree(pfns); >>> + >>> + return vaddr; >>> +} >> >> This function appears to be a duplicate of hv_map_memory() in Patch 11 of this >> series. Is it possible to structure things so there is only one implementation? In > > So right now it it identical, but there is an important difference: > the swiotlb memory is physically contiguous to start with, so we can > do the simple remap using vmap_range as suggested in the last mail. > The cases here are pretty weird in that netvsc_remap_buf is called right > after vzalloc. That is we create _two_ mappings in vmalloc space right > after another, where the original one is just used for establishing the > "GPADL handle" and freeing the memory. In other words, the obvious thing > to do here would be to use a vmalloc variant that allows to take the > shared_gpa_boundary into account when setting up the PTEs. The buffer is allocated via vmalloc(). It needs to be marked as host visible via a Hyper-V hvcall before being accessed through the address space above shared_gpa_boundary. The hvcall is issued in vmbus_establish_gpadl(). > > And here is somthing I need help from the x86 experts: does the CPU > actually care about this shared_gpa_boundary? Or does it just matter > for the generated DMA address? Does somehow have a good pointer to > how this mechanism works? > The shared_gpa_boundary is the vTOM feature of AMD SEV-SNP. Tom Lendacky described the feature in a previous mail. I have copied it here; please have a look. From Tom Lendacky: IIUC, this is using the vTOM feature of SEV-SNP.
When this feature is enabled for a VMPL level, any physical memory addresses below vTOM are considered private/encrypted and any physical memory addresses above vTOM are considered shared/unencrypted. With this option, you don't need a fully enlightened guest that sets and clears page table encryption bits. You just need the DMA buffers to be allocated in the proper range above vTOM. See the section on "Virtual Machine Privilege Levels" in https://www.amd.com/system/files/TechDocs/SEV-SNP-strengthening-vm-isolation-with-integrity-protection-and-more.pdf.
On 8/19/21 11:21 PM, hch@lst.de wrote: > On Thu, Aug 19, 2021 at 06:14:51PM +0000, Michael Kelley wrote: >>> + if (!pfns) >>> + return NULL; >>> + >>> + for (i = 0; i < size / HV_HYP_PAGE_SIZE; i++) >>> + pfns[i] = virt_to_hvpfn(buf + i * HV_HYP_PAGE_SIZE) >>> + + (ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT); >>> + >>> + vaddr = vmap_pfn(pfns, size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO); >>> + kfree(pfns); >>> + >>> + return vaddr; >>> +} >> >> This function appears to be a duplicate of hv_map_memory() in Patch 11 of this >> series. Is it possible to structure things so there is only one implementation? In > > So right now it it identical, but there is an important difference: > the swiotlb memory is physically contiguous to start with, so we can > do the simple remap using vmap_range as suggested in the last mail. > The cases here are pretty weird in that netvsc_remap_buf is called right > after vzalloc. That is we create _two_ mappings in vmalloc space right > after another, where the original one is just used for establishing the > "GPADL handle" and freeing the memory. In other words, the obvious thing > to do here would be to use a vmalloc variant that allows to take the > shared_gpa_boundary into account when setting up the PTEs. > > And here is somthing I need help from the x86 experts: does the CPU > actually care about this shared_gpa_boundary? Or does it just matter > for the generated DMA address? Does somehow have a good pointer to > how this mechanism works? The CPU does care. Here's some info: APM Volume 2, Section 15.36.8: https://www.amd.com/system/files/TechDocs/24593.pdf AMD SEV-SNP Whitepaper, Virtual Machine Privilege Levels (~page 14): https://www.amd.com/system/files/TechDocs/SEV-SNP-strengthening-vm-isolation-with-integrity-protection-and-more.pdf Thanks, Tom >
On 8/20/2021 2:14 AM, Michael Kelley wrote: >> @@ -477,6 +521,15 @@ static int netvsc_init_buf(struct hv_device *device, >> goto cleanup; >> } >> >> + if (hv_isolation_type_snp()) { >> + vaddr = netvsc_remap_buf(net_device->send_buf, buf_size); >> + if (!vaddr) >> + goto cleanup; > I don't think this error case is handled correctly. Doesn't the remapping > of the recv buf need to be undone? > Yes, actually I intended to return an error here so that free_netvsc_device() would eventually unmap the recv buffer. But I forgot to set ret = -ENOMEM when adding the netvsc_remap_buf() helper.
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index bc48855dff10..862419912bfb 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -164,6 +164,7 @@ struct hv_netvsc_packet { u32 total_bytes; u32 send_buf_index; u32 total_data_buflen; + struct hv_dma_range *dma_range; }; #define NETVSC_HASH_KEYLEN 40 @@ -1074,6 +1075,7 @@ struct netvsc_device { /* Receive buffer allocated by us but manages by NetVSP */ void *recv_buf; + void *recv_original_buf; u32 recv_buf_size; /* allocated bytes */ u32 recv_buf_gpadl_handle; u32 recv_section_cnt; @@ -1082,6 +1084,8 @@ struct netvsc_device { /* Send buffer allocated by us */ void *send_buf; + void *send_original_buf; + u32 send_buf_size; u32 send_buf_gpadl_handle; u32 send_section_cnt; u32 send_section_size; @@ -1730,4 +1734,6 @@ struct rndis_message { #define RETRY_US_HI 10000 #define RETRY_MAX 2000 /* >10 sec */ +void netvsc_dma_unmap(struct hv_device *hv_dev, + struct hv_netvsc_packet *packet); #endif /* _HYPERV_NET_H */ diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 7bd935412853..fc312e5db4d5 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -153,8 +153,21 @@ static void free_netvsc_device(struct rcu_head *head) int i; kfree(nvdev->extension); - vfree(nvdev->recv_buf); - vfree(nvdev->send_buf); + + if (nvdev->recv_original_buf) { + vunmap(nvdev->recv_buf); + vfree(nvdev->recv_original_buf); + } else { + vfree(nvdev->recv_buf); + } + + if (nvdev->send_original_buf) { + vunmap(nvdev->send_buf); + vfree(nvdev->send_original_buf); + } else { + vfree(nvdev->send_buf); + } + kfree(nvdev->send_section_map); for (i = 0; i < VRSS_CHANNEL_MAX; i++) { @@ -330,6 +343,27 @@ int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx) return nvchan->mrc.slots ? 
0 : -ENOMEM; } +static void *netvsc_remap_buf(void *buf, unsigned long size) +{ + unsigned long *pfns; + void *vaddr; + int i; + + pfns = kcalloc(size / HV_HYP_PAGE_SIZE, sizeof(unsigned long), + GFP_KERNEL); + if (!pfns) + return NULL; + + for (i = 0; i < size / HV_HYP_PAGE_SIZE; i++) + pfns[i] = virt_to_hvpfn(buf + i * HV_HYP_PAGE_SIZE) + + (ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT); + + vaddr = vmap_pfn(pfns, size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO); + kfree(pfns); + + return vaddr; +} + static int netvsc_init_buf(struct hv_device *device, struct netvsc_device *net_device, const struct netvsc_device_info *device_info) @@ -340,6 +374,7 @@ static int netvsc_init_buf(struct hv_device *device, unsigned int buf_size; size_t map_words; int i, ret = 0; + void *vaddr; /* Get receive buffer area. */ buf_size = device_info->recv_sections * device_info->recv_section_size; @@ -375,6 +410,15 @@ static int netvsc_init_buf(struct hv_device *device, goto cleanup; } + if (hv_isolation_type_snp()) { + vaddr = netvsc_remap_buf(net_device->recv_buf, buf_size); + if (!vaddr) + goto cleanup; + + net_device->recv_original_buf = net_device->recv_buf; + net_device->recv_buf = vaddr; + } + /* Notify the NetVsp of the gpadl handle */ init_packet = &net_device->channel_init_pkt; memset(init_packet, 0, sizeof(struct nvsp_message)); @@ -477,6 +521,15 @@ static int netvsc_init_buf(struct hv_device *device, goto cleanup; } + if (hv_isolation_type_snp()) { + vaddr = netvsc_remap_buf(net_device->send_buf, buf_size); + if (!vaddr) + goto cleanup; + + net_device->send_original_buf = net_device->send_buf; + net_device->send_buf = vaddr; + } + /* Notify the NetVsp of the gpadl handle */ init_packet = &net_device->channel_init_pkt; memset(init_packet, 0, sizeof(struct nvsp_message)); @@ -767,7 +820,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev, /* Notify the layer above us */ if (likely(skb)) { - const struct hv_netvsc_packet *packet + struct hv_netvsc_packet *packet = 
(struct hv_netvsc_packet *)skb->cb; u32 send_index = packet->send_buf_index; struct netvsc_stats *tx_stats; @@ -783,6 +836,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev, tx_stats->bytes += packet->total_bytes; u64_stats_update_end(&tx_stats->syncp); + netvsc_dma_unmap(ndev_ctx->device_ctx, packet); napi_consume_skb(skb, budget); } @@ -947,6 +1001,82 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, memset(dest, 0, padding); } +void netvsc_dma_unmap(struct hv_device *hv_dev, + struct hv_netvsc_packet *packet) +{ + u32 page_count = packet->cp_partial ? + packet->page_buf_cnt - packet->rmsg_pgcnt : + packet->page_buf_cnt; + int i; + + if (!hv_is_isolation_supported()) + return; + + if (!packet->dma_range) + return; + + for (i = 0; i < page_count; i++) + dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma, + packet->dma_range[i].mapping_size, + DMA_TO_DEVICE); + + kfree(packet->dma_range); +} + +/* netvsc_dma_map - Map swiotlb bounce buffer with data page of + * packet sent by vmbus_sendpacket_pagebuffer() in the Isolation + * VM. + * + * In isolation VM, netvsc send buffer has been marked visible to + * host and so the data copied to send buffer doesn't need to use + * bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer() + * may not be copied to send buffer and so these pages need to be + * mapped with swiotlb bounce buffer. netvsc_dma_map() is to do + * that. The pfns in the struct hv_page_buffer need to be converted + * to bounce buffer's pfn. The loop here is necessary and so not + * use dma_map_sg() here. + */ +int netvsc_dma_map(struct hv_device *hv_dev, + struct hv_netvsc_packet *packet, + struct hv_page_buffer *pb) +{ + u32 page_count = packet->cp_partial ? 
+ packet->page_buf_cnt - packet->rmsg_pgcnt : + packet->page_buf_cnt; + dma_addr_t dma; + int i; + + if (!hv_is_isolation_supported()) + return 0; + + packet->dma_range = kcalloc(page_count, + sizeof(*packet->dma_range), + GFP_KERNEL); + if (!packet->dma_range) + return -ENOMEM; + + for (i = 0; i < page_count; i++) { + char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT) + + pb[i].offset); + u32 len = pb[i].len; + + dma = dma_map_single(&hv_dev->device, src, len, + DMA_TO_DEVICE); + if (dma_mapping_error(&hv_dev->device, dma)) { + kfree(packet->dma_range); + return -ENOMEM; + } + + packet->dma_range[i].dma = dma; + packet->dma_range[i].mapping_size = len; + pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT; + pb[i].offset = offset_in_hvpage(dma); + pb[i].len = len; + } + + return 0; +} + static inline int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, @@ -987,14 +1117,22 @@ static inline int netvsc_send_pkt( trace_nvsp_send_pkt(ndev, out_channel, rpkt); + packet->dma_range = NULL; if (packet->page_buf_cnt) { if (packet->cp_partial) pb += packet->rmsg_pgcnt; + ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb); + if (ret) + return ret; + ret = vmbus_sendpacket_pagebuffer(out_channel, pb, packet->page_buf_cnt, &nvmsg, sizeof(nvmsg), req_id); + + if (ret) + netvsc_dma_unmap(ndev_ctx->device_ctx, packet); } else { ret = vmbus_sendpacket(out_channel, &nvmsg, sizeof(nvmsg), diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index f6c9c2a670f9..448fcc325ed7 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -361,6 +361,8 @@ static void rndis_filter_receive_response(struct net_device *ndev, } } + netvsc_dma_unmap(((struct net_device_context *) + netdev_priv(ndev))->device_ctx, &request->pkt); complete(&request->wait_event); } else { netdev_err(ndev, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 83fa567ad594..2ea638101645 100644 --- a/include/linux/hyperv.h 
+++ b/include/linux/hyperv.h @@ -1601,6 +1601,11 @@ struct hyperv_service_callback { void (*callback)(void *context); }; +struct hv_dma_range { + dma_addr_t dma; + u32 mapping_size; +}; + #define MAX_SRV_VER 0x7ffffff extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen, const int *fw_version, int fw_vercnt,