
[RFC,15/45] KVM: arm64: pkvm: Add __pkvm_host_share/unshare_dma()

Message ID 20230201125328.2186498-16-jean-philippe@linaro.org (mailing list archive)
State New, archived
Series KVM: Arm SMMUv3 driver for pKVM

Commit Message

Jean-Philippe Brucker Feb. 1, 2023, 12:52 p.m. UTC
Host pages mapped in the SMMU must not be donated to the guest or
hypervisor, since the host could then use DMA to break confidentiality.
Mark them shared in the host stage-2 page tables, and keep a refcount in
the hyp vmemmap.
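
For context, a rough sketch of the intended hyp-side calling pattern
(illustration only: the SMMU map path comes later in this series, and
smmu_map_pages() below is a placeholder name, not a real function):

static int iommu_share_and_map(unsigned long iova, phys_addr_t paddr,
			       size_t size, bool is_ram)
{
	int ret;

	/* Mark the pages as shared with the IOMMU (and refcount them) first */
	ret = __pkvm_host_share_dma(paddr, size, is_ram);
	if (ret)
		return ret;

	ret = smmu_map_pages(iova, paddr, size);
	if (ret)
		/* Roll back the share if the SMMU mapping failed */
		__pkvm_host_unshare_dma(paddr, size);

	return ret;
}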

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
---
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |   3 +
 arch/arm64/kvm/hyp/nvhe/mem_protect.c         | 185 ++++++++++++++++++
 2 files changed, 188 insertions(+)

Comments

Zhang, Tina Feb. 4, 2023, 12:51 p.m. UTC | #1
On 2/1/23 20:52, Jean-Philippe Brucker wrote:
> Host pages mapped in the SMMU must not be donated to the guest or
> hypervisor, since the host could then use DMA to break confidentiality.
> Mark them shared in the host stage-2 page tables, and keep a refcount in
> the hyp vmemmap.
> 
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
> ---
>   arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |   3 +
>   arch/arm64/kvm/hyp/nvhe/mem_protect.c         | 185 ++++++++++++++++++
>   2 files changed, 188 insertions(+)
> 
> diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> index 021825aee854..a363d58a998b 100644
> --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> @@ -58,6 +58,7 @@ enum pkvm_component_id {
>   	PKVM_ID_HOST,
>   	PKVM_ID_HYP,
>   	PKVM_ID_GUEST,
> +	PKVM_ID_IOMMU,
>   };
>   
>   extern unsigned long hyp_nr_cpus;
> @@ -72,6 +73,8 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
>   int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
>   int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
>   int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
> +int __pkvm_host_share_dma(u64 phys_addr, size_t size, bool is_ram);
> +int __pkvm_host_unshare_dma(u64 phys_addr, size_t size);
>   
>   bool addr_is_memory(phys_addr_t phys);
>   int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
> diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> index 856673291d70..dcf08ce03790 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> @@ -1148,6 +1148,9 @@ static int check_share(struct pkvm_mem_share *share)
>   	case PKVM_ID_GUEST:
>   		ret = guest_ack_share(completer_addr, tx, share->completer_prot);
>   		break;
> +	case PKVM_ID_IOMMU:
> +		ret = 0;
> +		break;
>   	default:
>   		ret = -EINVAL;
>   	}
> @@ -1185,6 +1188,9 @@ static int __do_share(struct pkvm_mem_share *share)
>   	case PKVM_ID_GUEST:
>   		ret = guest_complete_share(completer_addr, tx, share->completer_prot);
>   		break;
> +	case PKVM_ID_IOMMU:
> +		ret = 0;
> +		break;
>   	default:
>   		ret = -EINVAL;
>   	}
> @@ -1239,6 +1245,9 @@ static int check_unshare(struct pkvm_mem_share *share)
>   	case PKVM_ID_HYP:
>   		ret = hyp_ack_unshare(completer_addr, tx);
>   		break;
> +	case PKVM_ID_IOMMU:
> +		ret = 0;
> +		break;
>   	default:
>   		ret = -EINVAL;
>   	}
> @@ -1273,6 +1282,9 @@ static int __do_unshare(struct pkvm_mem_share *share)
>   	case PKVM_ID_HYP:
>   		ret = hyp_complete_unshare(completer_addr, tx);
>   		break;
> +	case PKVM_ID_IOMMU:
> +		ret = 0;
> +		break;
>   	default:
>   		ret = -EINVAL;
>   	}
> @@ -1633,6 +1645,179 @@ void hyp_unpin_shared_mem(void *from, void *to)
>   	host_unlock_component();
>   }
>   
> +static int __host_check_page_dma_shared(phys_addr_t phys_addr)
> +{
> +	int ret;
> +	u64 hyp_addr;
> +
> +	/*
> +	 * The page is already refcounted. Make sure it's owned by the host, and
> +	 * not part of the hyp pool.
> +	 */
> +	ret = __host_check_page_state_range(phys_addr, PAGE_SIZE,
> +					    PKVM_PAGE_SHARED_OWNED);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * Refcounted and owned by host, means it's either mapped in the
> +	 * SMMU, or it's some VM/VCPU state shared with the hypervisor.
> +	 * The host has no reason to use a page for both.
> +	 */
> +	hyp_addr = (u64)hyp_phys_to_virt(phys_addr);
> +	return __hyp_check_page_state_range(hyp_addr, PAGE_SIZE, PKVM_NOPAGE);
> +}
> +
> +static int __pkvm_host_share_dma_page(phys_addr_t phys_addr, bool is_ram)
> +{
> +	int ret;
> +	struct hyp_page *p = hyp_phys_to_page(phys_addr);
> +	struct pkvm_mem_share share = {
> +		.tx	= {
> +			.nr_pages	= 1,
> +			.initiator	= {
> +				.id	= PKVM_ID_HOST,
> +				.addr	= phys_addr,
> +			},
> +			.completer	= {
> +				.id	= PKVM_ID_IOMMU,
> +			},
> +		},
> +	};
> +
> +	hyp_assert_lock_held(&host_mmu.lock);
> +	hyp_assert_lock_held(&pkvm_pgd_lock);
> +
> +	/*
> +	 * Some differences between handling of RAM and device memory:
> +	 * - The hyp vmemmap area for device memory is not backed by physical
> +	 *   pages in the hyp page tables.
> +	 * - Device memory is unmapped automatically under memory pressure
> +	 *   (host_stage2_try()) and the ownership information would be
> +	 *   discarded.
> +	 * We don't need to deal with that at the moment, because the host
> +	 * cannot share or donate device memory, only RAM.
> +	 *
> +	 * Since 'is_ram' is only a hint provided by the host, we do need to
> +	 * make sure of it.
> +	 */
> +	if (!is_ram)
> +		return addr_is_memory(phys_addr) ? -EINVAL : 0;
> +
> +	ret = hyp_page_ref_inc_return(p);
> +	BUG_ON(ret == 0);
> +	if (ret < 0)
> +		return ret;
> +	else if (ret == 1)
> +		ret = do_share(&share);
> +	else
> +		ret = __host_check_page_dma_shared(phys_addr);
> +
> +	if (ret)
> +		hyp_page_ref_dec(p);
> +
> +	return ret;
> +}
> +
> +static int __pkvm_host_unshare_dma_page(phys_addr_t phys_addr)
> +{
> +	struct hyp_page *p = hyp_phys_to_page(phys_addr);
> +	struct pkvm_mem_share share = {
> +		.tx	= {
> +			.nr_pages	= 1,
> +			.initiator	= {
> +				.id	= PKVM_ID_HOST,
> +				.addr	= phys_addr,
> +			},
> +			.completer	= {
> +				.id	= PKVM_ID_IOMMU,
> +			},
> +		},
> +	};
> +
> +	hyp_assert_lock_held(&host_mmu.lock);
> +	hyp_assert_lock_held(&pkvm_pgd_lock);
> +
> +	if (!addr_is_memory(phys_addr))
> +		return 0;
> +
> +	if (!hyp_page_ref_dec_and_test(p))
> +		return 0;
> +
> +	return do_unshare(&share);
> +}
> +
> +/*
> + * __pkvm_host_share_dma - Mark host memory as used for DMA
> + * @phys_addr:	physical address of the DMA region
> + * @size:	size of the DMA region
> + * @is_ram:	whether it is RAM or device memory
> + *
> + * We must not allow the host to donate pages that are mapped in the IOMMU for
> + * DMA. So:
> + * 1. Mark the host S2 entry as being owned by IOMMU
> + * 2. Refcount it, since a page may be mapped in multiple device address spaces.
> + *
> + * At some point we may end up needing more than the current 16 bits for
> + * refcounting, for example if all devices and sub-devices map the same MSI
> + * doorbell page. It will do for now.
> + */
> +int __pkvm_host_share_dma(phys_addr_t phys_addr, size_t size, bool is_ram)
> +{
> +	int i;
> +	int ret;
> +	size_t nr_pages = size >> PAGE_SHIFT;
> +
> +	if (WARN_ON(!PAGE_ALIGNED(phys_addr | size)))
> +		return -EINVAL;
> +
> +	host_lock_component();
> +	hyp_lock_component();
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		ret = __pkvm_host_share_dma_page(phys_addr + i * PAGE_SIZE,
> +						 is_ram);
Hi Jean,

I'm not familiar with the ARM arch, just out of curiosity: if pKVM-ARM
populates the host stage-2 page table lazily, could there be a case where
a device driver in the host triggers DMA with pages which have not been
mapped into the host stage-2 page table yet? How do we handle this
situation?

Regards,
-Tina

> +		if (ret)
> +			break;
> +	}
> +
> +	if (ret) {
> +		for (--i; i >= 0; --i)
> +			__pkvm_host_unshare_dma_page(phys_addr + i * PAGE_SIZE);
> +	}
> +
> +	hyp_unlock_component();
> +	host_unlock_component();
> +
> +	return ret;
> +}
> +
> +int __pkvm_host_unshare_dma(phys_addr_t phys_addr, size_t size)
> +{
> +	int i;
> +	int ret;
> +	size_t nr_pages = size >> PAGE_SHIFT;
> +
> +	host_lock_component();
> +	hyp_lock_component();
> +
> +	/*
> +	 * We end up here after the caller successfully unmapped the page from
> +	 * the IOMMU table. Which means that a ref is held, the page is shared
> +	 * in the host s2, there can be no failure.
> +	 */
> +	for (i = 0; i < nr_pages; i++) {
> +		ret = __pkvm_host_unshare_dma_page(phys_addr + i * PAGE_SIZE);
> +		if (ret)
> +			break;
> +	}
> +
> +	hyp_unlock_component();
> +	host_unlock_component();
> +
> +	return ret;
> +}
> +
>   int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu)
>   {
>   	int ret;
Jean-Philippe Brucker Feb. 6, 2023, 12:13 p.m. UTC | #2
Hi Tina,

On Sat, Feb 04, 2023 at 08:51:38PM +0800, tina.zhang wrote:
> > +int __pkvm_host_share_dma(phys_addr_t phys_addr, size_t size, bool is_ram)
> > +{
> > +	int i;
> > +	int ret;
> > +	size_t nr_pages = size >> PAGE_SHIFT;
> > +
> > +	if (WARN_ON(!PAGE_ALIGNED(phys_addr | size)))
> > +		return -EINVAL;
> > +
> > +	host_lock_component();
> > +	hyp_lock_component();
> > +
> > +	for (i = 0; i < nr_pages; i++) {
> > +		ret = __pkvm_host_share_dma_page(phys_addr + i * PAGE_SIZE,
> > +						 is_ram);
> Hi Jean,
> 
> I'm not familiar with ARM arch. Just out of curiosity. If pKVM-ARM populates
> the host stage-2 page table lazily, would there be a case that device driver
> in host triggers DMA with pages which have not been mapped to the host
> stage-2 page table yet? How do we handle this situation?

It's possible that the host asks the hypervisor to map on the IOMMU side
a page that is not yet mapped on the CPU side. In general before calling
map_pages() the host zero-initializes the page, triggering a page fault
which creates the mapping, so this case is rare. But if it happens,
__pkvm_host_share_dma() will create the CPU stage-2 mapping:

 __pkvm_host_share_dma()
  do_share()
   host_initiate_share()
    host_stage2_idmap_locked()

which creates a valid identity mapping, along with the ownership
information PKVM_PAGE_SHARED_OWNED. That ownership info is really all we
need here, to prevent future donations to guests or hyp. Since the SMMU
side uses separate stage-2 page tables, we don't actually need to create a
valid mapping on the CPU side yet, that's just how pKVM's mem_protect
currently works. But I don't think it hurts to create the mapping right
away instead of waiting for the CPU page fault, because the host will
likely access the page soon to read what the device wrote there.
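
For reference, the initiator step in that chain boils down to roughly the
following (a condensed paraphrase of the existing mem_protect helpers, not
code from this patch):

static int host_initiate_share(u64 *completer_addr,
			       const struct pkvm_mem_transition *tx)
{
	u64 size = tx->nr_pages * PAGE_SIZE;
	enum kvm_pgtable_prot prot;

	/*
	 * Identity-map the range and record the page state in the SW bits of
	 * the host stage-2 entries (completer_addr bookkeeping omitted).
	 */
	prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
	return host_stage2_idmap_locked(tx->initiator.addr, size, prot);
}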

Thanks,
Jean
Zhang, Tina Feb. 7, 2023, 2:37 a.m. UTC | #3
Hi Jean,

On 2/6/23 20:13, Jean-Philippe Brucker wrote:
> Hi Tina,
> 
> On Sat, Feb 04, 2023 at 08:51:38PM +0800, tina.zhang wrote:
>>> +int __pkvm_host_share_dma(phys_addr_t phys_addr, size_t size, bool is_ram)
>>> +{
>>> +	int i;
>>> +	int ret;
>>> +	size_t nr_pages = size >> PAGE_SHIFT;
>>> +
>>> +	if (WARN_ON(!PAGE_ALIGNED(phys_addr | size)))
>>> +		return -EINVAL;
>>> +
>>> +	host_lock_component();
>>> +	hyp_lock_component();
>>> +
>>> +	for (i = 0; i < nr_pages; i++) {
>>> +		ret = __pkvm_host_share_dma_page(phys_addr + i * PAGE_SIZE,
>>> +						 is_ram);
>> Hi Jean,
>>
>> I'm not familiar with ARM arch. Just out of curiosity. If pKVM-ARM populates
>> the host stage-2 page table lazily, would there be a case that device driver
>> in host triggers DMA with pages which have not been mapped to the host
>> stage-2 page table yet? How do we handle this situation?
> 
> It's possible that the host asks the hypervisor to map on the IOMMU side
> a page that is not yet mapped on the CPU side. In general before calling
> map_pages() the host zero-initializes the page, triggering a page fault
> which creates the mapping, so this case is rare. But if it happens,
> __pkvm_host_share_dma() will create the CPU stage-2 mapping:
> 
>   __pkvm_host_share_dma()
>    do_share()
>     host_initiate_share()
>      host_stage2_idmap_locked()
> 
> which creates a valid identity mapping, along with the ownership
> information PKVM_PAGE_SHARED_OWNED. That ownership info is really all we
> need here, to prevent future donations to guests or hyp. Since the SMMU
> side uses separate stage-2 page tables, we don't actually need to create a
> valid mapping on the CPU side yet, that's just how pKVM's mem_protect
> currently works. But I don't think it hurts to create the mapping right
> away instead of waiting for the CPU page fault, because the host will
> likely access the page soon to read what the device wrote there.
Right. I was wondering whether, with the lazy stage-2 mapping method, there
would be a case where check_share() couldn't pass, as the pte might not be
valid at that time. After checking the logic, I see we check
kvm_pte_valid(pte) as well as the value of the pte, so I guess
check_share() can return successfully even w/o having triggered a host CPU
page fault. Thanks for elaborating.

Regards,
-Tina

> 
> Thanks,
> Jean
>
Jean-Philippe Brucker Feb. 7, 2023, 10:39 a.m. UTC | #4
On Tue, Feb 07, 2023 at 10:37:51AM +0800, tina.zhang wrote:
> Hi Jean,
> 
> On 2/6/23 20:13, Jean-Philippe Brucker wrote:
> > Hi Tina,
> > 
> > On Sat, Feb 04, 2023 at 08:51:38PM +0800, tina.zhang wrote:
> > > > +int __pkvm_host_share_dma(phys_addr_t phys_addr, size_t size, bool is_ram)
> > > > +{
> > > > +	int i;
> > > > +	int ret;
> > > > +	size_t nr_pages = size >> PAGE_SHIFT;
> > > > +
> > > > +	if (WARN_ON(!PAGE_ALIGNED(phys_addr | size)))
> > > > +		return -EINVAL;
> > > > +
> > > > +	host_lock_component();
> > > > +	hyp_lock_component();
> > > > +
> > > > +	for (i = 0; i < nr_pages; i++) {
> > > > +		ret = __pkvm_host_share_dma_page(phys_addr + i * PAGE_SIZE,
> > > > +						 is_ram);
> > > Hi Jean,
> > > 
> > > I'm not familiar with ARM arch. Just out of curiosity. If pKVM-ARM populates
> > > the host stage-2 page table lazily, would there be a case that device driver
> > > in host triggers DMA with pages which have not been mapped to the host
> > > stage-2 page table yet? How do we handle this situation?
> > 
> > It's possible that the host asks the hypervisor to map on the IOMMU side
> > a page that is not yet mapped on the CPU side. In general before calling
> > map_pages() the host zero-initializes the page, triggering a page fault
> > which creates the mapping, so this case is rare. But if it happens,
> > __pkvm_host_share_dma() will create the CPU stage-2 mapping:
> > 
> >   __pkvm_host_share_dma()
> >    do_share()
> >     host_initiate_share()
> >      host_stage2_idmap_locked()
> > 
> > which creates a valid identity mapping, along with the ownership
> > information PKVM_PAGE_SHARED_OWNED. That ownership info is really all we
> > need here, to prevent future donations to guests or hyp. Since the SMMU
> > side uses separate stage-2 page tables, we don't actually need to create a
> > valid mapping on the CPU side yet, that's just how pKVM's mem_protect
> > currently works. But I don't think it hurts to create the mapping right
> > away instead of waiting for the CPU page fault, because the host will
> > likely access the page soon to read what the device wrote there.
> Right. I was thinking if stage-2 lazy mapping method is being adopted,
> whether there would be a case that the check_share() couldn't pass, as the
> pte might not be valid at that time. After checking the logic, I see we
> check kvm_pte_valid(pte) as well as the value of pte. So I guess the
> check_share() can return successfully even w/o having triggered host CPU
> page fault.

Yes, during check_share(), host_get_page_state() allows a disabled PTE
unless the PTE already contains ownership information (in particular, if
the page has been donated to a guest or to the hypervisor, the PTE is
invalid and contains a non-zero owner ID).
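
For reference, that check boils down to something like this (paraphrasing
the existing host_get_page_state() helper):

static enum pkvm_page_state host_get_page_state(kvm_pte_t pte)
{
	/*
	 * An empty (zero) PTE just means the page hasn't been mapped yet and
	 * still defaults to host-owned. An invalid but non-zero PTE carries
	 * an owner ID, i.e. the page was donated elsewhere, so report
	 * PKVM_NOPAGE.
	 */
	if (!kvm_pte_valid(pte) && pte)
		return PKVM_NOPAGE;

	return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
}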

Thanks,
Jean
Mostafa Saleh Feb. 7, 2023, 12:53 p.m. UTC | #5
Hi Jean,

On Wed, Feb 01, 2023 at 12:52:59PM +0000, Jean-Philippe Brucker wrote:
> Host pages mapped in the SMMU must not be donated to the guest or
> hypervisor, since the host could then use DMA to break confidentiality.
> Mark them shared in the host stage-2 page tables, and keep a refcount in
> the hyp vmemmap.
> 
> Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
> ---
>  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |   3 +
>  arch/arm64/kvm/hyp/nvhe/mem_protect.c         | 185 ++++++++++++++++++
>  2 files changed, 188 insertions(+)
> 
> diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> index 021825aee854..a363d58a998b 100644
> --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> @@ -58,6 +58,7 @@ enum pkvm_component_id {
>  	PKVM_ID_HOST,
>  	PKVM_ID_HYP,
>  	PKVM_ID_GUEST,
> +	PKVM_ID_IOMMU,
>  };
>  
>  extern unsigned long hyp_nr_cpus;
> @@ -72,6 +73,8 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
>  int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
>  int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
>  int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
> +int __pkvm_host_share_dma(u64 phys_addr, size_t size, bool is_ram);
> +int __pkvm_host_unshare_dma(u64 phys_addr, size_t size);
>  
>  bool addr_is_memory(phys_addr_t phys);
>  int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
> diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> index 856673291d70..dcf08ce03790 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> @@ -1148,6 +1148,9 @@ static int check_share(struct pkvm_mem_share *share)
>  	case PKVM_ID_GUEST:
>  		ret = guest_ack_share(completer_addr, tx, share->completer_prot);
>  		break;
> +	case PKVM_ID_IOMMU:
> +		ret = 0;
> +		break;
>  	default:
>  		ret = -EINVAL;
>  	}
> @@ -1185,6 +1188,9 @@ static int __do_share(struct pkvm_mem_share *share)
>  	case PKVM_ID_GUEST:
>  		ret = guest_complete_share(completer_addr, tx, share->completer_prot);
>  		break;
> +	case PKVM_ID_IOMMU:
> +		ret = 0;
> +		break;
>  	default:
>  		ret = -EINVAL;
>  	}
> @@ -1239,6 +1245,9 @@ static int check_unshare(struct pkvm_mem_share *share)
>  	case PKVM_ID_HYP:
>  		ret = hyp_ack_unshare(completer_addr, tx);
>  		break;
> +	case PKVM_ID_IOMMU:
> +		ret = 0;
> +		break;
>  	default:
>  		ret = -EINVAL;
>  	}
> @@ -1273,6 +1282,9 @@ static int __do_unshare(struct pkvm_mem_share *share)
>  	case PKVM_ID_HYP:
>  		ret = hyp_complete_unshare(completer_addr, tx);
>  		break;
> +	case PKVM_ID_IOMMU:
> +		ret = 0;
> +		break;
>  	default:
>  		ret = -EINVAL;
>  	}
> @@ -1633,6 +1645,179 @@ void hyp_unpin_shared_mem(void *from, void *to)
>  	host_unlock_component();
>  }
>  
> +static int __host_check_page_dma_shared(phys_addr_t phys_addr)
> +{
> +	int ret;
> +	u64 hyp_addr;
> +
> +	/*
> +	 * The page is already refcounted. Make sure it's owned by the host, and
> +	 * not part of the hyp pool.
> +	 */
> +	ret = __host_check_page_state_range(phys_addr, PAGE_SIZE,
> +					    PKVM_PAGE_SHARED_OWNED);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * Refcounted and owned by host, means it's either mapped in the
> +	 * SMMU, or it's some VM/VCPU state shared with the hypervisor.
> +	 * The host has no reason to use a page for both.
> +	 */
> +	hyp_addr = (u64)hyp_phys_to_virt(phys_addr);
> +	return __hyp_check_page_state_range(hyp_addr, PAGE_SIZE, PKVM_NOPAGE);

This works for hyp-host sharing, but I am worried about the scalability of
this. For example, FF-A (still on the list) adds a new entity that can
share data with the host, and we would need an extra check for it.
https://lore.kernel.org/kvmarm/20221116170335.2341003-1-qperret@google.com/

One way I can think about this is to use the SW bits in the SMMU page table
to represent ownership, for example for PKVM_ID_IOMMU:

do_share:
- completer: set the SW bits in the SMMU page table to
PKVM_PAGE_SHARED_BORROWED

do_unshare:
- completer: set the SW bits back to PKVM_PAGE_OWNED (I think this is good
enough for host ownership)

In __pkvm_host_share_dma_page, if the page refcount is > 1, it succeeds
only if the page was borrowed in the SMMU page table and shared in the host
page table.
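
A minimal sketch of that idea, for illustration (the bit positions follow
the LPAE software-defined bits [58:55]; the macro and helper names below
are made up, not part of this series):

#define IOPTE_SW_STATE		GENMASK_ULL(56, 55)
#define IOPTE_STATE_OWNED	0ULL	/* default: owned by the host */
#define IOPTE_STATE_BORROWED	1ULL	/* mapped for DMA, borrowed from the host */

/* Record the pKVM share state in the ignored SW bits of an SMMU leaf PTE */
static void iopte_set_state(u64 *ptep, u64 state)
{
	u64 pte = READ_ONCE(*ptep);

	pte &= ~IOPTE_SW_STATE;
	pte |= FIELD_PREP(IOPTE_SW_STATE, state);
	WRITE_ONCE(*ptep, pte);
}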


Thanks,
Mostafa
Jean-Philippe Brucker Feb. 10, 2023, 7:21 p.m. UTC | #6
On Tue, Feb 07, 2023 at 12:53:56PM +0000, Mostafa Saleh wrote:
> > +static int __host_check_page_dma_shared(phys_addr_t phys_addr)
> > +{
> > +	int ret;
> > +	u64 hyp_addr;
> > +
> > +	/*
> > +	 * The page is already refcounted. Make sure it's owned by the host, and
> > +	 * not part of the hyp pool.
> > +	 */
> > +	ret = __host_check_page_state_range(phys_addr, PAGE_SIZE,
> > +					    PKVM_PAGE_SHARED_OWNED);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/*
> > +	 * Refcounted and owned by host, means it's either mapped in the
> > +	 * SMMU, or it's some VM/VCPU state shared with the hypervisor.
> > +	 * The host has no reason to use a page for both.
> > +	 */
> > +	hyp_addr = (u64)hyp_phys_to_virt(phys_addr);
> > +	return __hyp_check_page_state_range(hyp_addr, PAGE_SIZE, PKVM_NOPAGE);
> 
> This works for hyp-host sharing, I am worried about scalability of this.
> For example FFA(still on the list) adds a new entity that can share data
> with host, and we would need an extra check for it.
> https://lore.kernel.org/kvmarm/20221116170335.2341003-1-qperret@google.com/

Right, although it looks like the FF-A support doesn't need a refcount at
the moment, so passing such a page to iommu_map() would fail at
check_share() (but I may be wrong, I'll need another look). Even if it did
use a refcount, I think it may be fine to share it between different
entities: the refcount ensures that every sharing is undone before the
page can be donated to another entity, and separate tracking ensures
that the host undoes the right thing (the io-pgtable mappings for pages
shared with the IOMMU, and the secure world for FF-A).

Regardless, I agree that this doesn't scale: it gets too complex, and I'd
prefer it if tracking the page state were less subtle. I'll try to find
something more generic.

> 
> One way I can think about this, is to use the SW bits in SMMU page table to
> represent ownership, for example for PKVM_ID_IOMMU
> do_share:
> -completer: set SW bits in the SMMU page table to
> PKVM_PAGE_SHARED_BORROWED

I'm not sure I understand: would we add this in order to share a page that
is already mapped in the SMMU with another entity (e.g. FF-A)? Doing it
this way would be difficult because there may be a lot of different SMMU
page tables mapping the page.

The fact that a page is mapped in the SMMU page table already indicates
that it's borrowed from the host so the SW bits seem redundant.

> 
> do_unshare:
> -completer: set SW bit back to PKVM_PAGE_OWNED(I think this is good
> enough for host ownership)
> 
> In __pkvm_host_share_dma_page
> If page refcount > 1, it succeeds if only the page was borrowed in the
> SMMU page table and shared in the host page table.

We could also move more information into the vmemmap, because struct
hyp_page still has some space for page state. It has six free bits at the
moment, which is enough for both owner ID and share state, and maybe we
could steal a couple more from the refcount and order fields if necessary.

With that, the iommu_map() and unmap() paths could also be made a lot more
scalable, because they wouldn't need to take any page-table locks when
updating page state; a cmpxchg may be sufficient.
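
Purely as a hypothetical sketch of that direction (the helper below is
invented for illustration, assuming the existing 16-bit refcount field in
struct hyp_page):

/*
 * Hypothetical: bump the per-page DMA refcount in the vmemmap locklessly,
 * falling back to the locked do_share() path for the 0 -> 1 transition.
 * Overflow handling is omitted for brevity.
 */
static bool hyp_page_try_get_dma(struct hyp_page *p)
{
	unsigned short old = READ_ONCE(p->refcount);

	do {
		if (!old)
			return false;	/* first mapping: take the slow path */
	} while (!try_cmpxchg(&p->refcount, &old, old + 1));

	return true;
}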

Thanks,
Jean

Patch

diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 021825aee854..a363d58a998b 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -58,6 +58,7 @@  enum pkvm_component_id {
 	PKVM_ID_HOST,
 	PKVM_ID_HYP,
 	PKVM_ID_GUEST,
+	PKVM_ID_IOMMU,
 };
 
 extern unsigned long hyp_nr_cpus;
@@ -72,6 +73,8 @@  int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
 int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
 int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
 int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa);
+int __pkvm_host_share_dma(u64 phys_addr, size_t size, bool is_ram);
+int __pkvm_host_unshare_dma(u64 phys_addr, size_t size);
 
 bool addr_is_memory(phys_addr_t phys);
 int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 856673291d70..dcf08ce03790 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -1148,6 +1148,9 @@  static int check_share(struct pkvm_mem_share *share)
 	case PKVM_ID_GUEST:
 		ret = guest_ack_share(completer_addr, tx, share->completer_prot);
 		break;
+	case PKVM_ID_IOMMU:
+		ret = 0;
+		break;
 	default:
 		ret = -EINVAL;
 	}
@@ -1185,6 +1188,9 @@  static int __do_share(struct pkvm_mem_share *share)
 	case PKVM_ID_GUEST:
 		ret = guest_complete_share(completer_addr, tx, share->completer_prot);
 		break;
+	case PKVM_ID_IOMMU:
+		ret = 0;
+		break;
 	default:
 		ret = -EINVAL;
 	}
@@ -1239,6 +1245,9 @@  static int check_unshare(struct pkvm_mem_share *share)
 	case PKVM_ID_HYP:
 		ret = hyp_ack_unshare(completer_addr, tx);
 		break;
+	case PKVM_ID_IOMMU:
+		ret = 0;
+		break;
 	default:
 		ret = -EINVAL;
 	}
@@ -1273,6 +1282,9 @@  static int __do_unshare(struct pkvm_mem_share *share)
 	case PKVM_ID_HYP:
 		ret = hyp_complete_unshare(completer_addr, tx);
 		break;
+	case PKVM_ID_IOMMU:
+		ret = 0;
+		break;
 	default:
 		ret = -EINVAL;
 	}
@@ -1633,6 +1645,179 @@  void hyp_unpin_shared_mem(void *from, void *to)
 	host_unlock_component();
 }
 
+static int __host_check_page_dma_shared(phys_addr_t phys_addr)
+{
+	int ret;
+	u64 hyp_addr;
+
+	/*
+	 * The page is already refcounted. Make sure it's owned by the host, and
+	 * not part of the hyp pool.
+	 */
+	ret = __host_check_page_state_range(phys_addr, PAGE_SIZE,
+					    PKVM_PAGE_SHARED_OWNED);
+	if (ret)
+		return ret;
+
+	/*
+	 * Refcounted and owned by host, means it's either mapped in the
+	 * SMMU, or it's some VM/VCPU state shared with the hypervisor.
+	 * The host has no reason to use a page for both.
+	 */
+	hyp_addr = (u64)hyp_phys_to_virt(phys_addr);
+	return __hyp_check_page_state_range(hyp_addr, PAGE_SIZE, PKVM_NOPAGE);
+}
+
+static int __pkvm_host_share_dma_page(phys_addr_t phys_addr, bool is_ram)
+{
+	int ret;
+	struct hyp_page *p = hyp_phys_to_page(phys_addr);
+	struct pkvm_mem_share share = {
+		.tx	= {
+			.nr_pages	= 1,
+			.initiator	= {
+				.id	= PKVM_ID_HOST,
+				.addr	= phys_addr,
+			},
+			.completer	= {
+				.id	= PKVM_ID_IOMMU,
+			},
+		},
+	};
+
+	hyp_assert_lock_held(&host_mmu.lock);
+	hyp_assert_lock_held(&pkvm_pgd_lock);
+
+	/*
+	 * Some differences between handling of RAM and device memory:
+	 * - The hyp vmemmap area for device memory is not backed by physical
+	 *   pages in the hyp page tables.
+	 * - Device memory is unmapped automatically under memory pressure
+	 *   (host_stage2_try()) and the ownership information would be
+	 *   discarded.
+	 * We don't need to deal with that at the moment, because the host
+	 * cannot share or donate device memory, only RAM.
+	 *
+	 * Since 'is_ram' is only a hint provided by the host, we do need to
+	 * make sure of it.
+	 */
+	if (!is_ram)
+		return addr_is_memory(phys_addr) ? -EINVAL : 0;
+
+	ret = hyp_page_ref_inc_return(p);
+	BUG_ON(ret == 0);
+	if (ret < 0)
+		return ret;
+	else if (ret == 1)
+		ret = do_share(&share);
+	else
+		ret = __host_check_page_dma_shared(phys_addr);
+
+	if (ret)
+		hyp_page_ref_dec(p);
+
+	return ret;
+}
+
+static int __pkvm_host_unshare_dma_page(phys_addr_t phys_addr)
+{
+	struct hyp_page *p = hyp_phys_to_page(phys_addr);
+	struct pkvm_mem_share share = {
+		.tx	= {
+			.nr_pages	= 1,
+			.initiator	= {
+				.id	= PKVM_ID_HOST,
+				.addr	= phys_addr,
+			},
+			.completer	= {
+				.id	= PKVM_ID_IOMMU,
+			},
+		},
+	};
+
+	hyp_assert_lock_held(&host_mmu.lock);
+	hyp_assert_lock_held(&pkvm_pgd_lock);
+
+	if (!addr_is_memory(phys_addr))
+		return 0;
+
+	if (!hyp_page_ref_dec_and_test(p))
+		return 0;
+
+	return do_unshare(&share);
+}
+
+/*
+ * __pkvm_host_share_dma - Mark host memory as used for DMA
+ * @phys_addr:	physical address of the DMA region
+ * @size:	size of the DMA region
+ * @is_ram:	whether it is RAM or device memory
+ *
+ * We must not allow the host to donate pages that are mapped in the IOMMU for
+ * DMA. So:
+ * 1. Mark the host S2 entry as being owned by IOMMU
+ * 2. Refcount it, since a page may be mapped in multiple device address spaces.
+ *
+ * At some point we may end up needing more than the current 16 bits for
+ * refcounting, for example if all devices and sub-devices map the same MSI
+ * doorbell page. It will do for now.
+ */
+int __pkvm_host_share_dma(phys_addr_t phys_addr, size_t size, bool is_ram)
+{
+	int i;
+	int ret;
+	size_t nr_pages = size >> PAGE_SHIFT;
+
+	if (WARN_ON(!PAGE_ALIGNED(phys_addr | size)))
+		return -EINVAL;
+
+	host_lock_component();
+	hyp_lock_component();
+
+	for (i = 0; i < nr_pages; i++) {
+		ret = __pkvm_host_share_dma_page(phys_addr + i * PAGE_SIZE,
+						 is_ram);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (--i; i >= 0; --i)
+			__pkvm_host_unshare_dma_page(phys_addr + i * PAGE_SIZE);
+	}
+
+	hyp_unlock_component();
+	host_unlock_component();
+
+	return ret;
+}
+
+int __pkvm_host_unshare_dma(phys_addr_t phys_addr, size_t size)
+{
+	int i;
+	int ret;
+	size_t nr_pages = size >> PAGE_SHIFT;
+
+	host_lock_component();
+	hyp_lock_component();
+
+	/*
+	 * We end up here after the caller successfully unmapped the page from
+	 * the IOMMU table. Which means that a ref is held, the page is shared
+	 * in the host s2, there can be no failure.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		ret = __pkvm_host_unshare_dma_page(phys_addr + i * PAGE_SIZE);
+		if (ret)
+			break;
+	}
+
+	hyp_unlock_component();
+	host_unlock_component();
+
+	return ret;
+}
+
 int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu)
 {
 	int ret;