
[v1,14/14] iommu/arm-smmu-v3: Add arm_smmu_cache_invalidate_user

Message ID aa327f9ea61e5a4771c13e53639e33955b9acde3.1678348754.git.nicolinc@nvidia.com (mailing list archive)
State New, archived
Series Add Nested Translation Support for SMMUv3

Commit Message

Nicolin Chen March 9, 2023, 10:53 a.m. UTC
Add an arm_smmu_cache_invalidate_user() function for user space to
invalidate TLB entries and Context Descriptors, since either an IO page
table entry or a Context Descriptor in the user space may still be cached
by the hardware.

The input user_data is defined in "struct iommu_hwpt_invalidate_arm_smmuv3"
that contains the essential data for corresponding invalidation commands.

Co-developed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 56 +++++++++++++++++++++
 1 file changed, 56 insertions(+)

Comments

Robin Murphy March 9, 2023, 2:49 p.m. UTC | #1
On 2023-03-09 10:53, Nicolin Chen wrote:
> Add arm_smmu_cache_invalidate_user() function for user space to invalidate
> TLB entries and Context Descriptors, since either an IO page table entrie
> or a Context Descriptor in the user space is still cached by the hardware.
> 
> The input user_data is defined in "struct iommu_hwpt_invalidate_arm_smmuv3"
> that contains the essential data for corresponding invalidation commands.
> 
> Co-developed-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 56 +++++++++++++++++++++
>   1 file changed, 56 insertions(+)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index ac63185ae268..7d73eab5e7f4 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -2880,9 +2880,65 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
>   	arm_smmu_sva_remove_dev_pasid(domain, dev, pasid);
>   }
>   
> +static void arm_smmu_cache_invalidate_user(struct iommu_domain *domain,
> +					   void *user_data)
> +{
> +	struct iommu_hwpt_invalidate_arm_smmuv3 *inv_info = user_data;
> +	struct arm_smmu_cmdq_ent cmd = { .opcode = inv_info->opcode };
> +	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> +	struct arm_smmu_device *smmu = smmu_domain->smmu;
> +	size_t granule_size = inv_info->granule_size;
> +	unsigned long iova = 0;
> +	size_t size = 0;
> +	int ssid = 0;
> +
> +	if (!smmu || !smmu_domain->s2 || domain->type != IOMMU_DOMAIN_NESTED)
> +		return;
> +
> +	switch (inv_info->opcode) {
> +	case CMDQ_OP_CFGI_CD:
> +	case CMDQ_OP_CFGI_CD_ALL:
> +		return arm_smmu_sync_cd(smmu_domain, inv_info->ssid, true);

Since we let the guest choose its own S1Fmt (and S1CDMax, yet not 
S1DSS?), how can we assume leaf = true here?

> +	case CMDQ_OP_TLBI_NH_VA:
> +		cmd.tlbi.asid = inv_info->asid;
> +		fallthrough;
> +	case CMDQ_OP_TLBI_NH_VAA:
> +		if (!granule_size || !(granule_size & smmu->pgsize_bitmap) ||

Non-range invalidations with TG=0 are perfectly legal, and should not be 
ignored.

> +		    granule_size & ~(1ULL << __ffs(granule_size)))

If that's intended to mean is_power_of_2(), please just use is_power_of_2().
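
A minimal sketch of what that could look like, reusing the patch's
locals (and leaving aside the separate question above about non-range
TG=0 invalidations):

	/* hypothetical rewrite: reject a zero, non-power-of-2, or
	 * unsupported granule instead of open-coding the bit trick */
	if (!granule_size || !is_power_of_2(granule_size) ||
	    !(granule_size & smmu->pgsize_bitmap))
		return;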

> +			return;
> +
> +		iova = inv_info->range.start;
> +		size = inv_info->range.last - inv_info->range.start + 1;

If the design here is that user_data is so deeply driver-specific and 
special to the point that it can't possibly be passed as a type-checked 
union of the known and publicly-visible UAPI types that it is, wouldn't 
it make sense to just encode the whole thing in the expected format and 
not have to make these kinds of niggling little conversions at both ends?

> +		if (!size)
> +			return;
> +
> +		cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
> +		cmd.tlbi.leaf = inv_info->flags & IOMMU_SMMUV3_CMDQ_TLBI_VA_LEAF;
> +		__arm_smmu_tlb_inv_range(&cmd, iova, size, granule_size, smmu_domain);
> +		break;
> +	case CMDQ_OP_TLBI_NH_ASID:
> +		cmd.tlbi.asid = inv_info->asid;
> +		fallthrough;
> +	case CMDQ_OP_TLBI_NH_ALL:
> +		cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
> +		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
> +		break;
> +	case CMDQ_OP_ATC_INV:
> +		ssid = inv_info->ssid;
> +		iova = inv_info->range.start;
> +		size = inv_info->range.last - inv_info->range.start + 1;
> +		break;

Can we do any better than multiplying every single ATC_INV command, even 
for random bogus StreamIDs, into multiple commands across every physical 
device? In fact, I'm not entirely confident this isn't problematic, if 
the guest wishes to send invalidations for one device specifically while 
it's put some other device into a state where sending it a command would 
do something bad. At the very least, it's liable to be confusing if the 
guest sends a command for one StreamID but gets an error back for a 
different one.

And if we expect ATS, what about PRI? Per patch #4 you're currently 
offering that to the guest as well.

> +	default:
> +		return;

What about NSNH_ALL? That still needs to invalidate all the S1 context 
that the guest *thinks* it's invalidating.

Also, perhaps I've overlooked something obvious, but what's the 
procedure for reflecting illegal commands back to userspace? Some of the 
things we're silently ignoring here would be expected to raise 
CERROR_ILL. Same goes for all the other fault events which may occur due 
to invalid S1 config, come to think of it.

Thanks,
Robin.

> +	}
> +
> +	arm_smmu_atc_inv_domain(smmu_domain, ssid, iova, size);
> +}
> +
>   static const struct iommu_domain_ops arm_smmu_nested_domain_ops = {
>   	.attach_dev		= arm_smmu_attach_dev,
>   	.free			= arm_smmu_domain_free,
> +	.cache_invalidate_user	= arm_smmu_cache_invalidate_user,
>   };
>   
>   static struct iommu_domain *
Jason Gunthorpe March 9, 2023, 3:31 p.m. UTC | #2
On Thu, Mar 09, 2023 at 02:49:14PM +0000, Robin Murphy wrote:

> If the design here is that user_data is so deeply driver-specific and
> special to the point that it can't possibly be passed as a type-checked
> union of the known and publicly-visible UAPI types that it is, wouldn't it
> make sense to just encode the whole thing in the expected format and not
> have to make these kinds of niggling little conversions at both ends?

Yes, I suspect the design for ARM should have the input be the entire
actual command work queue entry. There is no reason to burn CPU cycles
in userspace marshalling it to something else and then decode it again
in the kernel. Organize things to point the ioctl directly at the
queue entry, and the kernel can do a single memcpy from guest
controlled pages to kernel memory then parse it?

More broadly, maybe this should be able to process a list of commands?
If the queue has a number of invalidations batching them to the kernel
sure would be nice.
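
As a rough sketch of that direction (the struct, macro, and the info
pointer are invented here, not part of this series):

	/* hypothetical batched-invalidation uAPI: userspace points the
	 * kernel at an array of raw 16-byte CMDQ entries */
	struct iommu_hwpt_invalidate_arm_smmuv3_batch {
		__u64 cmd_uptr;
		__u32 cmd_count;
		__u32 __reserved;
	};

	/* kernel side: one copy_from_user, then parse entries in place */
	#define CMD_BATCH_MAX 32
	u64 cmds[CMD_BATCH_MAX][CMDQ_ENT_DWORDS];

	if (info->cmd_count > CMD_BATCH_MAX)
		return -E2BIG;
	if (copy_from_user(cmds, u64_to_user_ptr(info->cmd_uptr),
			   info->cmd_count * CMDQ_ENT_DWORDS * sizeof(u64)))
		return -EFAULT;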

Maybe also for Intel? Kevin?

> Also, perhaps I've overlooked something obvious, but what's the procedure
> for reflecting illegal commands back to userspace? Some of the things we're
> silently ignoring here would be expected to raise CERROR_ILL. Same goes for
> all the other fault events which may occur due to invalid S1 config, come to
> think of it.

Perhaps the ioctl should fail and the userspace viommu should inject
this CERROR_ILL?

But I'm also wondering if we are making a mistake by not just having the
kernel driver expose a SW work queue in its native format, where the
ioctl is only just 'read the queue'. Then it could (asynchronously!)
push back answers, real or emulated, as well, including all error
indications.
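
Purely for illustration (nothing like this exists), the kernel could
push back per-command results in a record along these lines:

	/* hypothetical completion record for an async 'read the queue' model */
	struct iommu_hwqueue_completion {
		__u64 cmd[2];		/* the original 16-byte command */
		__u32 status;		/* 0 on success, or an emulated CERROR_* code */
		__u32 __reserved;
	};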

I think we got down this synchronous one-ioctl-per-invalidation path
because that was what the original generic stuff wanted to do. Is it
what we really want? Kevin, what is your perspective?

Jason
Nicolin Chen March 10, 2023, 3:51 a.m. UTC | #3
On Thu, Mar 09, 2023 at 02:49:14PM +0000, Robin Murphy wrote:
> On 2023-03-09 10:53, Nicolin Chen wrote:
> > Add arm_smmu_cache_invalidate_user() function for user space to invalidate
> > TLB entries and Context Descriptors, since either an IO page table entrie
> > or a Context Descriptor in the user space is still cached by the hardware.
> > 
> > The input user_data is defined in "struct iommu_hwpt_invalidate_arm_smmuv3"
> > that contains the essential data for corresponding invalidation commands.
> > 
> > Co-developed-by: Eric Auger <eric.auger@redhat.com>
> > Signed-off-by: Eric Auger <eric.auger@redhat.com>
> > Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> > ---
> >   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 56 +++++++++++++++++++++
> >   1 file changed, 56 insertions(+)
> > 
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > index ac63185ae268..7d73eab5e7f4 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > @@ -2880,9 +2880,65 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
> >       arm_smmu_sva_remove_dev_pasid(domain, dev, pasid);
> >   }
> > 
> > +static void arm_smmu_cache_invalidate_user(struct iommu_domain *domain,
> > +                                        void *user_data)
> > +{
> > +     struct iommu_hwpt_invalidate_arm_smmuv3 *inv_info = user_data;
> > +     struct arm_smmu_cmdq_ent cmd = { .opcode = inv_info->opcode };
> > +     struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> > +     struct arm_smmu_device *smmu = smmu_domain->smmu;
> > +     size_t granule_size = inv_info->granule_size;
> > +     unsigned long iova = 0;
> > +     size_t size = 0;
> > +     int ssid = 0;
> > +
> > +     if (!smmu || !smmu_domain->s2 || domain->type != IOMMU_DOMAIN_NESTED)
> > +             return;
> > +
> > +     switch (inv_info->opcode) {
> > +     case CMDQ_OP_CFGI_CD:
> > +     case CMDQ_OP_CFGI_CD_ALL:
> > +             return arm_smmu_sync_cd(smmu_domain, inv_info->ssid, true);
> 
> Since we let the guest choose its own S1Fmt (and S1CDMax, yet not
> S1DSS?), how can we assume leaf = true here?

The s1dss is forwarded in the user_data structure too. So, the
driver should have set that down into a nested STE too. Will add
this missing pathway.

And you are right that the guest OS can use a 2-level table, so
we should set leaf = false to cover all cases, I think.

> > +     case CMDQ_OP_TLBI_NH_VA:
> > +             cmd.tlbi.asid = inv_info->asid;
> > +             fallthrough;
> > +     case CMDQ_OP_TLBI_NH_VAA:
> > +             if (!granule_size || !(granule_size & smmu->pgsize_bitmap) ||
> 
> Non-range invalidations with TG=0 are perfectly legal, and should not be
> ignored.

I assume that you are talking about the pgsize_bitmap check.

QEMU embeds a !tg case into the granule_size [1]. So it might
not be straightforward to cover that case. Let me see how to
untangle different cases and handle them accordingly.

[1] https://patchew.org/QEMU/20200824094811.15439-1-peter.maydell@linaro.org/20200824094811.15439-9-peter.maydell@linaro.org/

> > +                 granule_size & ~(1ULL << __ffs(granule_size)))
> 
> If that's intended to mean is_power_of_2(), please just use is_power_of_2().
> 
> > +                     return;
> > +
> > +             iova = inv_info->range.start;
> > +             size = inv_info->range.last - inv_info->range.start + 1;
> 
> If the design here is that user_data is so deeply driver-specific and
> special to the point that it can't possibly be passed as a type-checked
> union of the known and publicly-visible UAPI types that it is, wouldn't
> it make sense to just encode the whole thing in the expected format and
> not have to make these kinds of niggling little conversions at both ends?

Hmm, that makes sense to me.

I just tracked back the history of Eric's previous work. There
was a mismatch between guest and host when RIL isn't supported
by the hardware. Now, the guest can have whatever information it'd
need from the host to send supported instructions.

> > +             if (!size)
> > +                     return;
> > +
> > +             cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
> > +             cmd.tlbi.leaf = inv_info->flags & IOMMU_SMMUV3_CMDQ_TLBI_VA_LEAF;
> > +             __arm_smmu_tlb_inv_range(&cmd, iova, size, granule_size, smmu_domain);
> > +             break;
> > +     case CMDQ_OP_TLBI_NH_ASID:
> > +             cmd.tlbi.asid = inv_info->asid;
> > +             fallthrough;
> > +     case CMDQ_OP_TLBI_NH_ALL:
> > +             cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
> > +             arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
> > +             break;
> > +     case CMDQ_OP_ATC_INV:
> > +             ssid = inv_info->ssid;
> > +             iova = inv_info->range.start;
> > +             size = inv_info->range.last - inv_info->range.start + 1;
> > +             break;
> 
> Can we do any better than multiplying every single ATC_INV command, even
> for random bogus StreamIDs, into multiple commands across every physical
> device? In fact, I'm not entirely confident this isn't problematic, if
> the guest wishes to send invalidations for one device specifically while
> it's put some other device into a state where sending it a command would
> do something bad. At the very least, it's liable to be confusing if the
> guest sends a command for one StreamID but gets an error back for a
> different one.

We'd need an SID translation here from the guest value to the
host value to specify a device, so as not to multiply the cmd
across the device list, if I understand it correctly?

> And if we expect ATS, what about PRI? Per patch #4 you're currently
> offering that to the guest as well.

Oh, I should have probably blocked PRI. The PRI and the fault
injection will follow after the basic 2-stage translation
patches. And I don't have supporting hardware to test PRI.

> 
> > +     default:
> > +             return;
> 
> What about NSNH_ALL? That still needs to invalidate all the S1 context
> that the guest *thinks* it's invalidating.

NSNH_ALL is translated to NH_ALL at the guest level. But maybe
it should have been done here instead.
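
Something along these lines in the host switch might cover it; a
sketch only, reusing the patch's locals:

	case CMDQ_OP_TLBI_NSNH_ALL:
		/* the guest thinks it is invalidating all S1 context,
		 * so downgrade to NH_ALL scoped to this domain's VMID */
		cmd.opcode = CMDQ_OP_TLBI_NH_ALL;
		cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
		break;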

Thanks
Nic
Nicolin Chen March 10, 2023, 4:20 a.m. UTC | #4
On Thu, Mar 09, 2023 at 11:31:04AM -0400, Jason Gunthorpe wrote:
> On Thu, Mar 09, 2023 at 02:49:14PM +0000, Robin Murphy wrote:
> 
> > If the design here is that user_data is so deeply driver-specific and
> > special to the point that it can't possibly be passed as a type-checked
> > union of the known and publicly-visible UAPI types that it is, wouldn't it
> > make sense to just encode the whole thing in the expected format and not
> > have to make these kinds of niggling little conversions at both ends?
> 
> Yes, I suspect the design for ARM should have the input be the entire
> actual command work queue entry. There is no reason to burn CPU cycles
> in userspace marshalling it to something else and then decode it again
> in the kernel. Organize things to point the ioctl directly at the
> queue entry, and the kernel can do a single memcpy from guest
> controlled pages to kernel memory then parse it?

There still can be complications to do something straightforward
like that. Firstly, the consumer and producer indexes might need
to be synced between the host and kernel? Secondly, things like
SID and VMID fields in the commands need to be replaced manually
when the host kernel reads commands out, which means that there
needs to be a translation table (or tables) in the host kernel to
replace those fields. These actually are parts of the features of
the VCMDQ hardware itself.

Though I am not sure about the amount of CPU cycles burnt, it
at least can simplify the uAPI a bit and meanwhile address the
multiplying issue with the ATC_INV command that Robin raised, so
long as we ensure the consumer and producer indexes don't get
messed up between host and guest?

Thanks
Nic
Jason Gunthorpe March 10, 2023, 4:19 p.m. UTC | #5
On Thu, Mar 09, 2023 at 08:20:03PM -0800, Nicolin Chen wrote:
> On Thu, Mar 09, 2023 at 11:31:04AM -0400, Jason Gunthorpe wrote:
> > On Thu, Mar 09, 2023 at 02:49:14PM +0000, Robin Murphy wrote:
> > 
> > > If the design here is that user_data is so deeply driver-specific and
> > > special to the point that it can't possibly be passed as a type-checked
> > > union of the known and publicly-visible UAPI types that it is, wouldn't it
> > > make sense to just encode the whole thing in the expected format and not
> > > have to make these kinds of niggling little conversions at both ends?
> > 
> > Yes, I suspect the design for ARM should have the input be the entire
> > actual command work queue entry. There is no reason to burn CPU cycles
> > in userspace marshalling it to something else and then decode it again
> > in the kernel. Organize things to point the ioctl directly at the
> > queue entry, and the kernel can do a single memcpy from guest
> > controlled pages to kernel memory then parse it?
> 
> There still can be complications to do something straightforward
> like that. 

> Firstly, the consumer and producer indexes might need
> to be synced between the host and kernel?

No, qemu would handle this. The kernel would just read the command
entries it is told by qemu to read, which qemu has already sorted out.

> Secondly, things like SID and VMID fields in the commands need to
> be replaced manually when the host kernel reads commands out, which
> means that there need to be a translation table(s) in the host
> kernel to replace those fields. These actually are parts of the
> features of VCMDQ hardware itself.

VMID should be ignored in a guest request.

SID translation is a good point. Can qemu do this? How does SID
translation work with VCMDQ in HW? (Jean this is exactly the sort of
tiny detail that the generic interface ignored)

What I'm broadly thinking is if we have to make the infrastructure for
VCMDQ HW accelerated invalidation then it is not a big step to also
have the kernel SW path use the same infrastructure just with a CPU
wake up instead of a MMIO poke.

Ie we have a SW version of VCMDQ to speed up SMMUv3 cases without HW
support.

I suspect the answer to Robin's question on how to handle errors is
the most important deciding factor. If we have to capture and relay
actual HW errors back to userspace that really suggests we should do
something different than a synchronous ioctl.

Jason
Robin Murphy March 10, 2023, 5:53 p.m. UTC | #6
On 2023-03-10 03:51, Nicolin Chen wrote:
> On Thu, Mar 09, 2023 at 02:49:14PM +0000, Robin Murphy wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On 2023-03-09 10:53, Nicolin Chen wrote:
>>> Add arm_smmu_cache_invalidate_user() function for user space to invalidate
>>> TLB entries and Context Descriptors, since either an IO page table entrie
>>> or a Context Descriptor in the user space is still cached by the hardware.
>>>
>>> The input user_data is defined in "struct iommu_hwpt_invalidate_arm_smmuv3"
>>> that contains the essential data for corresponding invalidation commands.
>>>
>>> Co-developed-by: Eric Auger <eric.auger@redhat.com>
>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
>>> ---
>>>    drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 56 +++++++++++++++++++++
>>>    1 file changed, 56 insertions(+)
>>>
>>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>> index ac63185ae268..7d73eab5e7f4 100644
>>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>>> @@ -2880,9 +2880,65 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
>>>        arm_smmu_sva_remove_dev_pasid(domain, dev, pasid);
>>>    }
>>>
>>> +static void arm_smmu_cache_invalidate_user(struct iommu_domain *domain,
>>> +                                        void *user_data)
>>> +{
>>> +     struct iommu_hwpt_invalidate_arm_smmuv3 *inv_info = user_data;
>>> +     struct arm_smmu_cmdq_ent cmd = { .opcode = inv_info->opcode };
>>> +     struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>> +     struct arm_smmu_device *smmu = smmu_domain->smmu;
>>> +     size_t granule_size = inv_info->granule_size;
>>> +     unsigned long iova = 0;
>>> +     size_t size = 0;
>>> +     int ssid = 0;
>>> +
>>> +     if (!smmu || !smmu_domain->s2 || domain->type != IOMMU_DOMAIN_NESTED)
>>> +             return;
>>> +
>>> +     switch (inv_info->opcode) {
>>> +     case CMDQ_OP_CFGI_CD:
>>> +     case CMDQ_OP_CFGI_CD_ALL:
>>> +             return arm_smmu_sync_cd(smmu_domain, inv_info->ssid, true);
>>
>> Since we let the guest choose its own S1Fmt (and S1CDMax, yet not
>> S1DSS?), how can we assume leaf = true here?
> 
> The s1dss is forwarded in the user_data structure too. So, the
> driver should have set that too down to a nested STE. Will add
> this missing pathway.
> 
> And you are right that the guest OS can use a 2-level table, so
> we should set leaf = false to cover all cases, I think.
> 
>>> +     case CMDQ_OP_TLBI_NH_VA:
>>> +             cmd.tlbi.asid = inv_info->asid;
>>> +             fallthrough;
>>> +     case CMDQ_OP_TLBI_NH_VAA:
>>> +             if (!granule_size || !(granule_size & smmu->pgsize_bitmap) ||
>>
>> Non-range invalidations with TG=0 are perfectly legal, and should not be
>> ignored.
> 
> I assume that you are talking about the pgsize_bitmap check.
> 
> QEMU embeds a !tg case into the granule_size [1]. So it might
> not be straightforward to cover that case. Let me see how to
> untangle different cases and handle them accordingly.

Oh, double-checking patch #2, that might be me misunderstanding the 
interface. I hadn't realised that the UAPI was apparently modelled on 
arm_smmu_tlb_inv_range_asid() rather than actual SMMU commands :)

I really think UAPI should reflect the hardware and encode TG and TTL 
directly. Especially since there's technically a flaw in the current 
driver where we assume TTL in cases where it isn't actually known, thus 
may potentially fail to invalidate level 2 block entries when removing a 
level 1 table, since io-pgtable passes the level 3 granule in that case. 
When range invalidation came along, the distinction between "all leaves 
are definitely at the last level" and "use last-level granularity to 
make sure everything at any level is hit" started to matter, but the 
interface never caught up. It hasn't seemed desperately urgent to fix 
(who does 1GB+ unmaps outside of VFIO teardown anyway?), but we must 
definitely not bake the same mistake into user ABI.

Of course, there might then be cases where we need to transform 
non-range commands into range commands for the sake of workarounds, but 
that's our own problem to deal with.

> [1] https://patchew.org/QEMU/20200824094811.15439-1-peter.maydell@linaro.org/20200824094811.15439-9-peter.maydell@linaro.org/
> 
>>> +                 granule_size & ~(1ULL << __ffs(granule_size)))
>>
>> If that's intended to mean is_power_of_2(), please just use is_power_of_2().
>>
>>> +                     return;
>>> +
>>> +             iova = inv_info->range.start;
>>> +             size = inv_info->range.last - inv_info->range.start + 1;
>>
>> If the design here is that user_data is so deeply driver-specific and
>> special to the point that it can't possibly be passed as a type-checked
>> union of the known and publicly-visible UAPI types that it is, wouldn't
>> it make sense to just encode the whole thing in the expected format and
>> not have to make these kinds of niggling little conversions at both ends?
> 
> Hmm, that makes sense to me.
> 
> I just tracked back the history of Eric's previous work. There
> was a mismatch between guest and host that RIL isn't supported
> by the hardware. Now, guest can have whatever information it'd
> need from the host to send supported instructions.
> 
>>> +             if (!size)
>>> +                     return;
>>> +
>>> +             cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
>>> +             cmd.tlbi.leaf = inv_info->flags & IOMMU_SMMUV3_CMDQ_TLBI_VA_LEAF;
>>> +             __arm_smmu_tlb_inv_range(&cmd, iova, size, granule_size, smmu_domain);
>>> +             break;
>>> +     case CMDQ_OP_TLBI_NH_ASID:
>>> +             cmd.tlbi.asid = inv_info->asid;
>>> +             fallthrough;
>>> +     case CMDQ_OP_TLBI_NH_ALL:
>>> +             cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
>>> +             arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
>>> +             break;
>>> +     case CMDQ_OP_ATC_INV:
>>> +             ssid = inv_info->ssid;
>>> +             iova = inv_info->range.start;
>>> +             size = inv_info->range.last - inv_info->range.start + 1;
>>> +             break;
>>
>> Can we do any better than multiplying every single ATC_INV command, even
>> for random bogus StreamIDs, into multiple commands across every physical
>> device? In fact, I'm not entirely confident this isn't problematic, if
>> the guest wishes to send invalidations for one device specifically while
>> it's put some other device into a state where sending it a command would
>> do something bad. At the very least, it's liable to be confusing if the
>> guest sends a command for one StreamID but gets an error back for a
>> different one.
> 
> We'd need here an sid translation from the guest value to the
> host value to specify a device, so as not to multiply the cmd
> with the device list, if I understand it correctly?

I guess it depends on whether IOMMUFD is aware of the vSID->device 
relationships that the VMM is using. If so, then it should be OK for the 
VMM to pass through the vSID directly, and we can translate and 
sanity-check it internally. Otherwise, the interface might have to 
require the VMM to translate vSID->RID and pass the corresponding host 
RID, which we can then map back to a SID (userspace cannot do the full 
vSID->SID by itself, and even if it could that would probably be more 
awkward to validate).

>> And if we expect ATS, what about PRI? Per patch #4 you're currently
>> offering that to the guest as well.
> 
> Oh, I should have probably blocked PRI. The PRI and the fault
> injection will be followed after the basic 2-stage translation
> patches. And I don't have a supporting hardware to test PRI.
> 
>>
>>> +     default:
>>> +             return;
>>
>> What about NSNH_ALL? That still needs to invalidate all the S1 context
>> that the guest *thinks* it's invalidating.
> 
> NSNH_ALL is translated to NH_ALL at the guest level. But maybe
> it should have been done here instead.

Yes. It seems the worst of both worlds to have an interface which takes 
raw opcodes rather than an enum of supported commands, but still 
requires userspace to know which opcodes are supported and which ones 
don't work as expected even though they are entirely reasonable to use 
in the context of the stage-1-only SMMU being emulated.

Thanks,
Robin.
Jason Gunthorpe March 10, 2023, 6:49 p.m. UTC | #7
On Fri, Mar 10, 2023 at 05:53:46PM +0000, Robin Murphy wrote:

> I guess it depends on whether IOMMUFD is aware of the vSID->device
> relationships that the VMM is using. If so, then it should be OK for the VMM
> to pass through the vSID directly, and we can translate and sanity-check it
> internally. Otherwise, the interface might have to require the VMM to
> translate vSID->RID and pass the corresponding host RID, which we can then
> map back to a SID (userspace cannot do the full vSID->SID by itself, and
> even if it could that would probably be more awkward to validate).

The thing we have in iommufd is the "idevid", ie the handle for
the 'struct device' which is also the handle for the physical SID in
the iommu.

The trouble is that there is not such an easy way for the iommu driver
to translate an idevid at this point since it would have to call out
from a built-in kernel driver to the iommufd module :( :( We have to
eventually solve that but I was hoping it wouldn't have to be on the
fast path...

So, having a vSID xarray in the driver that holds the struct device *
is possibly a good thing. Especially if the vCMDQ scheme needs the
same information.
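
A minimal sketch of that idea (the vsid_to_master xarray is
hypothetical):

	/* hypothetical per-domain table linking guest stream IDs to
	 * masters, populated at attach time with
	 * xa_store(&smmu_domain->vsid_to_master, vsid, master, GFP_KERNEL) */
	static struct arm_smmu_master *
	arm_smmu_find_master_by_vsid(struct arm_smmu_domain *smmu_domain,
				     u32 vsid)
	{
		return xa_load(&smmu_domain->vsid_to_master, vsid);
	}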

Jason
Nicolin Chen March 11, 2023, 11:56 a.m. UTC | #8
On Fri, Mar 10, 2023 at 12:19:50PM -0400, Jason Gunthorpe wrote:
> On Thu, Mar 09, 2023 at 08:20:03PM -0800, Nicolin Chen wrote:
> > On Thu, Mar 09, 2023 at 11:31:04AM -0400, Jason Gunthorpe wrote:
> > > On Thu, Mar 09, 2023 at 02:49:14PM +0000, Robin Murphy wrote:
> > > 
> > > > If the design here is that user_data is so deeply driver-specific and
> > > > special to the point that it can't possibly be passed as a type-checked
> > > > union of the known and publicly-visible UAPI types that it is, wouldn't it
> > > > make sense to just encode the whole thing in the expected format and not
> > > > have to make these kinds of niggling little conversions at both ends?
> > > 
> > > Yes, I suspect the design for ARM should have the input be the entire
> > > actual command work queue entry. There is no reason to burn CPU cycles
> > > in userspace marshalling it to something else and then decode it again
> > > in the kernel. Organize things to point the ioctl directly at the
> > > queue entry, and the kernel can do a single memcpy from guest
> > > controlled pages to kernel memory then parse it?
> > 
> > There still can be complications to do something straightforward
> > like that. 
> 
> > Firstly, the consumer and producer indexes might need
> > to be synced between the host and kernel?
> 
> No, qemu would handles this. The kernel would just read the command
> entries it is told by qemu to read which qemu has already sorted out.

Then, instead of sending commands, forward the consumer index?

> > Secondly, things like SID and VMID fields in the commands need to
> > be replaced manually when the host kernel reads commands out, which
> > means that there need to be a translation table(s) in the host
> > kernel to replace those fields. These actually are parts of the
> > features of VCMDQ hardware itself.
> 
> VMID should be ignored in a guest request.

The guest always sets VMID fields to zero. But they should then be
handled in the host for most of the TLBI commands.

VCMDQ has a register to set the VMID explicitly so the hardware can
fill the VMID fields automatically.

> SID translation is a good point. Can qemu do this? How does SID
> translation work with VCMDQ in HW? (Jean this is exactly the sort of
> tiny detail that the generic interface ignored)

VCMDQ has multiple pairs of MATCH and REPLACE registers to set
up a hardware lookup table for SIDs. So the hardware can do the job,
replacing the SID fields in the TLBI commands.

> What I'm broadly thinking is if we have to make the infrastructure for
> VCMDQ HW accelerated invalidation then it is not a big step to also
> have the kernel SW path use the same infrastructure just with a CPU
> wake up instead of a MMIO poke.
> 
> Ie we have a SW version of VCMDQ to speed up SMMUv3 cases without HW
> support.

Very interesting idea!

I recall that one difficulty is to pass the vSID from the guest
down to the host kernel driver and link it with the pSID. What I
did previously for VCMDQ was to set the SID_MATCH register with
iommu_group_id(group) and set the SID_REPLACE register with the
pSID. Then the hypervisor will use the iommu_group_id to search
for the pair of registers, and to set the vSID. Perhaps we should
think of something smarter.
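
Roughly the scheme described above, with made-up register accessors
(SID_MATCH/SID_REPLACE are the registers just mentioned):

	/* program one translation pair: match on the group ID that
	 * the guest was given, replace with the physical stream ID */
	vcmdq_write_sid_match(vcmdq, slot, iommu_group_id(group));
	vcmdq_write_sid_replace(vcmdq, slot, psid);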

> I suspect the answer to Robin's question on how to handle errors is
> the most important deciding factor. If we have to capture and relay
> actual HW errors back to userspace that really suggests we should do
> something different than a synchronous ioctl.

By a synchronous ioctl, do you mean one that returns some values,
rather than defining cache_invalidate_user as void like we are doing
now? A fault injection pathway to report CERROR asynchronously is
what we've been doing though, even with Eric's previous VFIO solution.

Thanks
Nic
Nicolin Chen March 11, 2023, 12:38 p.m. UTC | #9
On Fri, Mar 10, 2023 at 05:53:46PM +0000, Robin Murphy wrote:

> > > > +     case CMDQ_OP_TLBI_NH_VA:
> > > > +             cmd.tlbi.asid = inv_info->asid;
> > > > +             fallthrough;
> > > > +     case CMDQ_OP_TLBI_NH_VAA:
> > > > +             if (!granule_size || !(granule_size & smmu->pgsize_bitmap) ||
> > > 
> > > Non-range invalidations with TG=0 are perfectly legal, and should not be
> > > ignored.
> > 
> > I assume that you are talking about the pgsize_bitmap check.
> > 
> > QEMU embeds a !tg case into the granule_size [1]. So it might
> > not be straightforward to cover that case. Let me see how to
> > untangle different cases and handle them accordingly.
> 
> Oh, double-checking patch #2, that might be me misunderstanding the
> interface. I hadn't realised that the UAPI was apparently modelled on
> arm_smmu_tlb_inv_range_asid() rather than actual SMMU commands :)

Yea. In fact, most of the invalidation info in QEMU was packed
for the previously defined general cache invalidation structure,
and the range invalidation part is still not quite independent.

> I really think UAPI should reflect the hardware and encode TG and TTL
> directly. Especially since there's technically a flaw in the current
> driver where we assume TTL in cases where it isn't actually known, thus
> may potentially fail to invalidate level 2 block entries when removing a
> level 1 table, since io-pgtable passes the level 3 granule in that case.

Do you mean something like hw_info forwarding pgsize_bitmap/tg
to the guest? Or the other direction?

> When range invalidation came along, the distinction between "all leaves
> are definitely at the last level" and "use last-level granularity to
> make sure everything at at any level is hit" started to matter, but the
> interface never caught up. It hasn't seemed desperately urgent to fix
> (who does 1GB+ unmaps outside of VFIO teardown anyway?), but we must
> definitely not bake the same mistake into user ABI.
> 
> Of course, there might then be cases where we need to transform
> non-range commands into range commands for the sake of workarounds, but
> that's our own problem to deal with.

Noted it down.

> > > What about NSNH_ALL? That still needs to invalidate all the S1 context
> > > that the guest *thinks* it's invalidating.
> > 
> > NSNH_ALL is translated to NH_ALL at the guest level. But maybe
> > it should have been done here instead.
> 
> Yes. It seems the worst of both worlds to have an interface which takes
> raw opcodes rather than an enum of supported commands, but still
> requires userspace to know which opcodes are supported and which ones
> don't work as expected even though they are entirely reasonable to use
> in the context of the stage-1-only SMMU being emulated.

Maybe a list of supported TLBI commands via the hw_info uAPI?

Thanks
Nic
Nicolin Chen March 11, 2023, 12:53 p.m. UTC | #10
On Sat, Mar 11, 2023 at 03:56:56AM -0800, Nicolin Chen wrote:
 
> I recall that one difficulty is to pass the vSID from the guest
> down to the host kernel driver and to link with the pSID. What I
> did previously for VCMDQ was to set the SID_MATCH register with
> iommu_group_id(group) and set the SID_REPLACE register with the
> pSID. Then hyper will use the iommu_group_id to search for the
> pair of the registers, and to set vSID. Perhaps we should think
> of something smarter.

I just found that the CFGI_STE command has the SID field, yet
we just didn't pack it into the data structure for the hwpt_alloc
ioctl. So, perhaps it isn't that difficult at all. I'll give it a
test run next week.

Thanks
Nic
Robin Murphy March 13, 2023, 1:07 p.m. UTC | #11
On 2023-03-11 12:38, Nicolin Chen wrote:
> On Fri, Mar 10, 2023 at 05:53:46PM +0000, Robin Murphy wrote:
> 
>>>>> +     case CMDQ_OP_TLBI_NH_VA:
>>>>> +             cmd.tlbi.asid = inv_info->asid;
>>>>> +             fallthrough;
>>>>> +     case CMDQ_OP_TLBI_NH_VAA:
>>>>> +             if (!granule_size || !(granule_size & smmu->pgsize_bitmap) ||
>>>>
>>>> Non-range invalidations with TG=0 are perfectly legal, and should not be
>>>> ignored.
>>>
>>> I assume that you are talking about the pgsize_bitmap check.
>>>
>>> QEMU embeds a !tg case into the granule_size [1]. So it might
>>> not be straightforward to cover that case. Let me see how to
>>> untangle different cases and handle them accordingly.
>>
>> Oh, double-checking patch #2, that might be me misunderstanding the
>> interface. I hadn't realised that the UAPI was apparently modelled on
>> arm_smmu_tlb_inv_range_asid() rather than actual SMMU commands :)
> 
> Yea. In fact, most of the invalidation info in QEMU was packed
> for the previously defined general cache invalidation structure,
> and the range invalidation part is still not quite independent.
> 
>> I really think UAPI should reflect the hardware and encode TG and TTL
>> directly. Especially since there's technically a flaw in the current
>> driver where we assume TTL in cases where it isn't actually known, thus
>> may potentially fail to invalidate level 2 block entries when removing a
>> level 1 table, since io-pgtable passes the level 3 granule in that case.
> 
> Do you mean something like hw_info forwarding pgsize_bitmap/tg
> to the guest? Or the other direction?

I mean if the interface wants to support range invalidations in a way 
which works correctly, then it should ideally carry both the TG and TTL 
fields from the guest command straight through to the host. If not, then 
at the very least the host must always assume TTL=0, because it cannot 
correctly infer otherwise once the guest command's original intent has 
been lost.
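
Concretely, that suggests the invalidation structure carrying the raw
command fields, e.g. (a hypothetical layout, not the one in this
series):

	struct iommu_hwpt_invalidate_arm_smmuv3 {	/* hypothetical revision */
		__u64 addr;	/* address/range, as in the TLBI command */
		__u32 asid;
		__u8  tg;	/* translation granule, straight from the guest */
		__u8  ttl;	/* leaf level hint; 0 = unknown, not inferred */
		__u8  scale;	/* range encoding, if RIL is used */
		__u8  num;
	};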

>> When range invalidation came along, the distinction between "all leaves
>> are definitely at the last level" and "use last-level granularity to
>> make sure everything at at any level is hit" started to matter, but the
>> interface never caught up. It hasn't seemed desperately urgent to fix
>> (who does 1GB+ unmaps outside of VFIO teardown anyway?), but we must
>> definitely not bake the same mistake into user ABI.
>>
>> Of course, there might then be cases where we need to transform
>> non-range commands into range commands for the sake of workarounds, but
>> that's our own problem to deal with.
> 
> Noted it down.
> 
>>>> What about NSNH_ALL? That still needs to invalidate all the S1 context
>>>> that the guest *thinks* it's invalidating.
>>>
>>> NSNH_ALL is translated to NH_ALL at the guest level. But maybe
>>> it should have been done here instead.
>>
>> Yes. It seems the worst of both worlds to have an interface which takes
>> raw opcodes rather than an enum of supported commands, but still
>> requires userspace to know which opcodes are supported and which ones
>> don't work as expected even though they are entirely reasonable to use
>> in the context of the stage-1-only SMMU being emulated.
> 
> Maybe a list of supported TLBI commands via the hw_info uAPI?

I don't think it's all that difficult to implicitly support all commands 
that are valid for a stage-1-only SMMU, it just needs the right 
interface design to be capable of encoding them all completely and 
unambiguously. Coming back to the previous point about the address 
encoding, I think that means basing it more directly on the actual 
SMMUv3 commands, rather than on io-pgtable's abstraction of invalidation 
with SMMUv3 opcodes bolted on.

Thanks,
Robin.
Nicolin Chen March 16, 2023, 12:01 a.m. UTC | #12
On Mon, Mar 13, 2023 at 01:07:42PM +0000, Robin Murphy wrote:
> On 2023-03-11 12:38, Nicolin Chen wrote:
> > On Fri, Mar 10, 2023 at 05:53:46PM +0000, Robin Murphy wrote:
> > 
> > > > > > +     case CMDQ_OP_TLBI_NH_VA:
> > > > > > +             cmd.tlbi.asid = inv_info->asid;
> > > > > > +             fallthrough;
> > > > > > +     case CMDQ_OP_TLBI_NH_VAA:
> > > > > > +             if (!granule_size || !(granule_size & smmu->pgsize_bitmap) ||
> > > > > 
> > > > > Non-range invalidations with TG=0 are perfectly legal, and should not be
> > > > > ignored.
> > > > 
> > > > I assume that you are talking about the pgsize_bitmap check.
> > > > 
> > > > QEMU embeds a !tg case into the granule_size [1]. So it might
> > > > not be straightforward to cover that case. Let me see how to
> > > > untangle different cases and handle them accordingly.
> > > 
> > > Oh, double-checking patch #2, that might be me misunderstanding the
> > > interface. I hadn't realised that the UAPI was apparently modelled on
> > > arm_smmu_tlb_inv_range_asid() rather than actual SMMU commands :)
> > 
> > Yea. In fact, most of the invalidation info in QEMU was packed
> > for the previously defined general cache invalidation structure,
> > and the range invalidation part is still not quite independent.
> > 
> > > I really think UAPI should reflect the hardware and encode TG and TTL
> > > directly. Especially since there's technically a flaw in the current
> > > driver where we assume TTL in cases where it isn't actually known, thus
> > > may potentially fail to invalidate level 2 block entries when removing a
> > > level 1 table, since io-pgtable passes the level 3 granule in that case.
> > 
> > Do you mean something like hw_info forwarding pgsize_bitmap/tg
> > to the guest? Or the other direction?
> 
> I mean if the interface wants to support range invalidations in a way
> which works correctly, then it should ideally carry both the TG and TTL
> fields from the guest command straight through to the host. If not, then
> at the very least the host must always assume TTL=0, because it cannot
> correctly infer otherwise once the guest command's original intent has
> been lost.

Oh, it's about the hypervisor simply forwarding the entire CMD to
the host side. Jason is suggesting a fast approach by letting the
host kernel read the CMDQ directly to get the raw CMD. Perhaps
that would address these comments about TG/TTL too.

I wonder if there could be a case other than a workaround, where the
TG/TTL fields from the guest aren't supported by the host. And then
should the host handle it with a different CMD?

> > > When range invalidation came along, the distinction between "all leaves
> > > are definitely at the last level" and "use last-level granularity to
> > > make sure everything at at any level is hit" started to matter, but the
> > > interface never caught up. It hasn't seemed desperately urgent to fix
> > > (who does 1GB+ unmaps outside of VFIO teardown anyway?), but we must
> > > definitely not bake the same mistake into user ABI.
> > > 
> > > Of course, there might then be cases where we need to transform
> > > non-range commands into range commands for the sake of workarounds, but
> > > that's our own problem to deal with.
> > 
> > Noted it down.
> > 
> > > > > What about NSNH_ALL? That still needs to invalidate all the S1 context
> > > > > that the guest *thinks* it's invalidating.
> > > > 
> > > > NSNH_ALL is translated to NH_ALL at the guest level. But maybe
> > > > it should have been done here instead.
> > > 
> > > Yes. It seems the worst of both worlds to have an interface which takes
> > > raw opcodes rather than an enum of supported commands, but still
> > > requires userspace to know which opcodes are supported and which ones
> > > don't work as expected even though they are entirely reasonable to use
> > > in the context of the stage-1-only SMMU being emulated.
> > 
> > Maybe a list of supported TLBI commands via the hw_info uAPI?
> 
> I don't think it's all that difficult to implicitly support all commands
> that are valid for a stage-1-only SMMU, it just needs the right
> interface design to be capable of encoding them all completely and
> unambiguously. Coming back to the previous point about the address
> encoding, I think that means basing it more directly on the actual
> SMMUv3 commands, rather than on io-pgtable's abstraction of invalidation
> with SMMUv3 opcodes bolted on.

Yea, with the actual commands from the guest, the host can do
something with its supported commands, I think.

Thanks
Nicolin
Robin Murphy March 16, 2023, 2:58 p.m. UTC | #13
On 2023-03-16 00:01, Nicolin Chen wrote:
> On Mon, Mar 13, 2023 at 01:07:42PM +0000, Robin Murphy wrote:
>> On 2023-03-11 12:38, Nicolin Chen wrote:
>>> On Fri, Mar 10, 2023 at 05:53:46PM +0000, Robin Murphy wrote:
>>>
>>>>>>> +     case CMDQ_OP_TLBI_NH_VA:
>>>>>>> +             cmd.tlbi.asid = inv_info->asid;
>>>>>>> +             fallthrough;
>>>>>>> +     case CMDQ_OP_TLBI_NH_VAA:
>>>>>>> +             if (!granule_size || !(granule_size & smmu->pgsize_bitmap) ||
>>>>>>
>>>>>> Non-range invalidations with TG=0 are perfectly legal, and should not be
>>>>>> ignored.
>>>>>
>>>>> I assume that you are talking about the pgsize_bitmap check.
>>>>>
>>>>> QEMU embeds a !tg case into the granule_size [1]. So it might
>>>>> not be straightforward to cover that case. Let me see how to
>>>>> untangle different cases and handle them accordingly.
>>>>
>>>> Oh, double-checking patch #2, that might be me misunderstanding the
>>>> interface. I hadn't realised that the UAPI was apparently modelled on
>>>> arm_smmu_tlb_inv_range_asid() rather than actual SMMU commands :)
>>>
>>> Yea. In fact, most of the invalidation info in QEMU was packed
>>> for the previously defined general cache invalidation structure,
>>> and the range invalidation part is still not quite independent.
>>>
>>>> I really think UAPI should reflect the hardware and encode TG and TTL
>>>> directly. Especially since there's technically a flaw in the current
>>>> driver where we assume TTL in cases where it isn't actually known, thus
>>>> may potentially fail to invalidate level 2 block entries when removing a
>>>> level 1 table, since io-pgtable passes the level 3 granule in that case.
>>>
>>> Do you mean something like hw_info forwarding pgsize_bitmap/tg
>>> to the guest? Or the other direction?
>>
>> I mean if the interface wants to support range invalidations in a way
>> which works correctly, then it should ideally carry both the TG and TTL
>> fields from the guest command straight through to the host. If not, then
>> at the very least the host must always assume TTL=0, because it cannot
>> correctly infer otherwise once the guest command's original intent has
>> been lost.
> 
> Oh, it's about hypervisor simply forwarding the entire CMD to
> the host side. Jason is suggesting a fast approach by letting
> host kernel read the CMDQ directly to get the raw CMD. Perhaps
> that would address this comments about TG/TTL too.

That did cross my mind, but given the usage model, having host userspace 
give guest memory whose contents it can't control (unless it pauses the 
whole VM on all CPUs) directly to the host kernel just seems to invite 
more potential problems than necessary. Commands aren't big, so I think 
it's fair to expect the VMM to marshal them into host memory, and save 
the host kernel from ever having to reason about any races or other 
emulation details which may exist between a VM and its VMM.

> I wonder if there could be other case than a WAR, where TG/TTL
> fields from the guest's aren't supported by the host. And then
> should the host handle it with a different CMD?

As Eric found previously, there's a clear benefit in emulating range 
invalidation for guests even if the underlying hardware doesn't support 
it, to minimise trapping. But that's not hard, and the patch as-is is 
already achieving it. All we need to be careful to avoid is issuing 
hardware commands with *less* scope than guest originally asked for - if 
the guest asks for a nonsense TG/TTL which doesn't match its current 
context, that's fine. The interface just has to ensure that a VMM's SMMU 
emulation *is* able to make a nested S1 context behave as expected by 
the architecture; we don't need to care if a guest uses the architecture 
wrong, since it's only hurting itself.
>>>> When range invalidation came along, the distinction between "all leaves
>>>> are definitely at the last level" and "use last-level granularity to
>>>> make sure everything at at any level is hit" started to matter, but the
>>>> interface never caught up. It hasn't seemed desperately urgent to fix
>>>> (who does 1GB+ unmaps outside of VFIO teardown anyway?), but we must
>>>> definitely not bake the same mistake into user ABI.
>>>>
>>>> Of course, there might then be cases where we need to transform
>>>> non-range commands into range commands for the sake of workarounds, but
>>>> that's our own problem to deal with.
>>>
>>> Noted it down.
>>>
>>>>>> What about NSNH_ALL? That still needs to invalidate all the S1 context
>>>>>> that the guest *thinks* it's invalidating.
>>>>>
>>>>> NSNH_ALL is translated to NH_ALL at the guest level. But maybe
>>>>> it should have been done here instead.
>>>>
>>>> Yes. It seems the worst of both worlds to have an interface which takes
>>>> raw opcodes rather than an enum of supported commands, but still
>>>> requires userspace to know which opcodes are supported and which ones
>>>> don't work as expected even though they are entirely reasonable to use
>>>> in the context of the stage-1-only SMMU being emulated.
>>>
>>> Maybe a list of supported TLBI commands via the hw_info uAPI?
>>
>> I don't think it's all that difficult to implicitly support all commands
>> that are valid for a stage-1-only SMMU, it just needs the right
>> interface design to be capable of encoding them all completely and
>> unambiguously. Coming back to the previous point about the address
>> encoding, I think that means basing it more directly on the actual
>> SMMUv3 commands, rather than on io-pgtable's abstraction of invalidation
>> with SMMUv3 opcodes bolted on.
> 
> Yea, with the actual commands from the guest, the host can do
> something with its supported commands, I think.

The one slightly fiddly case, of course, is CMD_SYNC, but I think that's 
just a matter for clear documentation of the expectations and behaviour.

Thanks,
Robin.
Nicolin Chen March 16, 2023, 9:09 p.m. UTC | #14
On Thu, Mar 16, 2023 at 02:58:39PM +0000, Robin Murphy wrote:

> > > > > I really think UAPI should reflect the hardware and encode TG and TTL
> > > > > directly. Especially since there's technically a flaw in the current
> > > > > driver where we assume TTL in cases where it isn't actually known, thus
> > > > > may potentially fail to invalidate level 2 block entries when removing a
> > > > > level 1 table, since io-pgtable passes the level 3 granule in that case.
> > > > 
> > > > Do you mean something like hw_info forwarding pgsize_bitmap/tg
> > > > to the guest? Or the other direction?
> > > 
> > > I mean if the interface wants to support range invalidations in a way
> > > which works correctly, then it should ideally carry both the TG and TTL
> > > fields from the guest command straight through to the host. If not, then
> > > at the very least the host must always assume TTL=0, because it cannot
> > > correctly infer otherwise once the guest command's original intent has
> > > been lost.
> > 
> > Oh, it's about hypervisor simply forwarding the entire CMD to
> > the host side. Jason is suggesting a fast approach by letting
> > host kernel read the CMDQ directly to get the raw CMD. Perhaps
> > that would address this comments about TG/TTL too.
> 
> That did cross my mind, but given the usage model, having host userspace
> give guest memory whose contents it can't control (unless it pauses the
> whole VM on all CPUs) directly to the host kernel just seems to invite
> more potential problems than necessary. Commands aren't big, so I think
> it's fair to expect the VMM to marshal them into host memory, and save
> the host kernel from ever having to reason about any races or other
> emulation details which may exist between a VM and its VMM.

An invalidation ioctl is synchronously executed from the top
level in QEMU when it traps any CMDQ_PROD write. So, whether we
pack the fields of a command into a data structure or just
forward the command directly, it seems to be the same as far as
worrying about race conditions goes?
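
For reference, the flow described above is roughly (function names
invented for illustration):

	/* QEMU side: trap a CMDQ_PROD write, drain one command at a time */
	static void smmuv3_cmdq_prod_write(SMMUv3State *s, uint32_t new_prod)
	{
	    while (smmuv3_q_cons(s) != new_prod) {
	        Cmd cmd;

	        smmuv3_q_read(s, &cmd);           /* copy one 16-byte command */
	        smmuv3_invalidate_ioctl(s, &cmd); /* synchronous ioctl per command */
	        smmuv3_q_cons_incr(s);
	    }
	}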

> > I wonder if there could be other case than a WAR, where TG/TTL
> > fields from the guest's aren't supported by the host. And then
> > should the host handle it with a different CMD?
> 
> As Eric found previously, there's a clear benefit in emulating range
> invalidation for guests even if the underlying hardware doesn't support
> it, to minimise trapping. But that's not hard, and the patch as-is is
> already achieving it. All we need to be careful to avoid is issuing
> hardware commands with *less* scope than guest originally asked for - if
> the guest asks for a nonsense TG/TTL which doesn't match its current
> context, that's fine. The interface just has to ensure that a VMM's SMMU
> emulation *is* able to make a nested S1 context behave as expected by
> the architecture; we don't need to care if a guest uses the architecture
> wrong, since it's only hurting itself.

Agreed. Yet, similar to moving the emulation of TLBI_NSNH_ALL
from QEMU to the kernel, we could move the emulations of other
TLBI commands to the kernel too? So that a hypervisor doesn't
need to know the TLBI commands supported by the underlying host,
and simply relies on the host to emulate each command with
whatever actual commands the host can do, addressing
one of your comments mentioned in the conversation below?

> > > > > > > What about NSNH_ALL? That still needs to invalidate all the S1 context
> > > > > > > that the guest *thinks* it's invalidating.
> > > > > > 
> > > > > > NSNH_ALL is translated to NH_ALL at the guest level. But maybe
> > > > > > it should have been done here instead.
> > > > > 
> > > > > Yes. It seems the worst of both worlds to have an interface which takes
> > > > > raw opcodes rather than an enum of supported commands, but still
> > > > > requires userspace to know which opcodes are supported and which ones
> > > > > don't work as expected even though they are entirely reasonable to use
> > > > > in the context of the stage-1-only SMMU being emulated.
> > > > 
> > > > Maybe a list of supported TLBI commands via the hw_info uAPI?
> > > 
> > > I don't think it's all that difficult to implicitly support all commands
> > > that are valid for a stage-1-only SMMU, it just needs the right
> > > interface design to be capable of encoding them all completely and
> > > unambiguously. Coming back to the previous point about the address
> > > encoding, I think that means basing it more directly on the actual
> > > SMMUv3 commands, rather than on io-pgtable's abstraction of invalidation
> > > with SMMUv3 opcodes bolted on.
> > 
> > Yea, with the actual commands from the guest, the host can do
> > something with its supported commands, I think.
> 
> The one slightly fiddly case, of course, is CMD_SYNC, but I think that's
> just a matter for clear documentation of the expectations and behaviour.

What could be odd about CMD_SYNC?

Actually with QEMU, an ioctl for a CMD execution is initiated
by a CMDQ_PROD write trapped by QEMU, so a CMD_SYNC only
triggers an IRQ in this setup.

Thanks
Nic
Tian, Kevin March 17, 2023, 9:24 a.m. UTC | #15
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, March 9, 2023 11:31 PM
> 
> > Also, perhaps I've overlooked something obvious, but what's the
> procedure
> > for reflecting illegal commands back to userspace? Some of the things we're
> > silently ignoring here would be expected to raise CERROR_ILL. Same goes
> for
> > all the other fault events which may occur due to invalid S1 config, come to
> > think of it.
> 
> Perhaps the ioctl should fail and the userpace viommu should inject
> this CERROR_ILL?
> 
> But I'm also wondering if we are making a mistake to not just have the
> kernel driver to expose a SW work queue in its native format and the
> ioctl is only just 'read the queue'. Then it could (asynchronously!)
> push back answers, real or emulated, as well, including all error
> indications.
> 
> I think we got down this synchronous one-ioctl-per-invalidation path
> because that was what the original generic stuff wanted to do. Is it
> what we really want? Kevin what is your perspective?
> 

That's an interesting idea. I think the original synchronous model
also matches how the intel-iommu driver works today. Most of the
time it does one synchronous invalidation at a time.

Another problem is how to map invalidation scope in native descriptor
format to affected devices.

VT-d allows per-DID invalidation. This needs extra information to map
vDID to affected devices in the kernel.

It also allows a global invalidation type which invalidates all vDIDs.
This might be easily handled by simply looping over every device bound
to the iommufd_ctx, as in the sketch below.
Tian, Kevin March 17, 2023, 9:41 a.m. UTC | #16
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Saturday, March 11, 2023 12:20 AM
> 
> What I'm broadly thinking is if we have to make the infrastructure for
> VCMDQ HW accelerated invalidation then it is not a big step to also
> have the kernel SW path use the same infrastructure just with a CPU
> wake up instead of a MMIO poke.
> 
> Ie we have a SW version of VCMDQ to speed up SMMUv3 cases without HW
> support.
> 

I thought about this in the VT-d context. It looks like there are
some difficulties.

The most prominent one is that the head/tail of the VT-d invalidation
queue are in MMIO registers. Handling it in the kernel iommu driver
suggests reading the virtual tail register and updating the virtual
head register. That is kind of moving some vIOMMU awareness into the
kernel which, iirc, is not a welcome model.

vhost doesn't have this problem as its vring structure fully resides in
memory, including the ring tail/head. As long as the kernel vhost driver
understands the structure and can send/receive notifications to/from kvm,
the in-kernel acceleration works seamlessly.

Not sure whether SMMU has a similar obstacle to VT-d. But this is my
impression of why vhost-iommu was preferred when such an optimization
was discussed before.
Tian, Kevin March 17, 2023, 9:47 a.m. UTC | #17
> From: Robin Murphy <robin.murphy@arm.com>
> Sent: Thursday, March 9, 2023 10:49 PM
> > +	case CMDQ_OP_ATC_INV:
> > +		ssid = inv_info->ssid;
> > +		iova = inv_info->range.start;
> > +		size = inv_info->range.last - inv_info->range.start + 1;
> > +		break;
> 
> Can we do any better than multiplying every single ATC_INV command, even
> for random bogus StreamIDs, into multiple commands across every physical
> device? In fact, I'm not entirely confident this isn't problematic, if
> the guest wishes to send invalidations for one device specifically while
> it's put some other device into a state where sending it a command would
> do something bad. At the very least, it's liable to be confusing if the
> guest sends a command for one StreamID but gets an error back for a
> different one.
> 

Or do we need to support this cmd at all?

For VT-d we always implicitly invalidate the ATC following an IOTLB
invalidation request from userspace. Then the vIOMMU just treats it
as a nop in the virtual queue.

IMHO a sane iommu driver should always invalidate the IOTLB and ATC
together. I'm not sure there is a valid usage where the IOTLB is
invalidated while the ATC is left with some stale mappings.
Nicolin Chen March 17, 2023, 2:16 p.m. UTC | #18
On Fri, Mar 17, 2023 at 09:47:47AM +0000, Tian, Kevin wrote:
> > From: Robin Murphy <robin.murphy@arm.com>
> > Sent: Thursday, March 9, 2023 10:49 PM
> > > +   case CMDQ_OP_ATC_INV:
> > > +           ssid = inv_info->ssid;
> > > +           iova = inv_info->range.start;
> > > +           size = inv_info->range.last - inv_info->range.start + 1;
> > > +           break;
> >
> > Can we do any better than multiplying every single ATC_INV command, even
> > for random bogus StreamIDs, into multiple commands across every physical
> > device? In fact, I'm not entirely confident this isn't problematic, if
> > the guest wishes to send invalidations for one device specifically while
> > it's put some other device into a state where sending it a command would
> > do something bad. At the very least, it's liable to be confusing if the
> > guest sends a command for one StreamID but gets an error back for a
> > different one.
> >
> 
> Or do we need support this cmd at all?
> 
> For vt-d we always implicitly invalidate ATC following a iotlb invalidation
> request from userspace. Then vIOMMU just treats it as a nop in the
> virtual queue.
> 
> IMHO a sane iommu driver should always invalidate both iotlb and atc
> together. I'm not sure a valid usage where iotlb is invalidated while
> atc is left with some stale mappings.

The vSMMU code in QEMU actually doesn't forward this command. So,
I guess that you are right about this support here and we may
just drop it.

Thanks!
Nic
Nicolin Chen March 17, 2023, 2:24 p.m. UTC | #19
On Fri, Mar 17, 2023 at 09:41:34AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Saturday, March 11, 2023 12:20 AM
> >
> > What I'm broadly thinking is if we have to make the infrastructure for
> > VCMDQ HW accelerated invalidation then it is not a big step to also
> > have the kernel SW path use the same infrastructure just with a CPU
> > wake up instead of a MMIO poke.
> >
> > Ie we have a SW version of VCMDQ to speed up SMMUv3 cases without HW
> > support.
> >
> 
> I thought about this in VT-d context. Looks there are some difficulties.
> 
> The most prominent one is that head/tail of the VT-d invalidation queue
> are in MMIO registers. Handling it in kernel iommu driver suggests
> reading virtual tail register and updating virtual head register. Kind of
> moving some vIOMMU awareness into the kernel which, iirc, is not
> a welcomed model.

I had a similar question in another email:
"Firstly, the consumer and producer indexes might need
 to be synced between the host and kernel?"
And Jason replied with this:
"No, qemu would handle this. The kernel would just read the command
 entries it is told by qemu to read, which qemu has already sorted out."

Maybe there is no need to worry about the head/tail readings?

> vhost doesn't have this problem as its vring structure fully resides in
> memory including ring tail/head. As long as kernel vhost driver understands
> the structure and can send/receive notification to/from kvm then the
> in-kernel acceleration works seamlessly.
> 
> Not sure whether SMMU has similar obstacle as VT-d. But this is my
> impression why vhost-iommu is preferred when talking about such
> optimization before.

SMMU has a similar pair of head/tail pointers for its invalidation
queue (consumer/producer indexes and command queue in SMMU terms).

Thanks
Nic
Nicolin Chen March 20, 2023, 1:32 a.m. UTC | #20
On Thu, Mar 16, 2023 at 02:09:08PM -0700, Nicolin Chen wrote:
> On Thu, Mar 16, 2023 at 02:58:39PM +0000, Robin Murphy wrote:
> 
> > > > > > I really think UAPI should reflect the hardware and encode TG and TTL
> > > > > > directly. Especially since there's technically a flaw in the current
> > > > > > driver where we assume TTL in cases where it isn't actually known, thus
> > > > > > may potentially fail to invalidate level 2 block entries when removing a
> > > > > > level 1 table, since io-pgtable passes the level 3 granule in that case.
> > > > > 
> > > > > Do you mean something like hw_info forwarding pgsize_bitmap/tg
> > > > > to the guest? Or the other direction?
> > > > 
> > > > I mean if the interface wants to support range invalidations in a way
> > > > which works correctly, then it should ideally carry both the TG and TTL
> > > > fields from the guest command straight through to the host. If not, then
> > > > at the very least the host must always assume TTL=0, because it cannot
> > > > correctly infer otherwise once the guest command's original intent has
> > > > been lost.
> > > 
> > > Oh, it's about hypervisor simply forwarding the entire CMD to
> > > the host side. Jason is suggesting a fast approach by letting
> > > host kernel read the CMDQ directly to get the raw CMD. Perhaps
> > > that would address this comments about TG/TTL too.
> > 
> > That did cross my mind, but given the usage model, having host userspace
> > give guest memory whose contents it can't control (unless it pauses the
> > whole VM on all CPUs) directly to the host kernel just seems to invite
> > more potential problems than necessary. Commands aren't big, so I think
> > it's fair to expect the VMM to marshal them into host memory, and save
> > the host kernel from ever having to reason about any races or other
> > emulation details which may exist between a VM and its VMM.
> 
> An invalidation ioctl is synchronously executed from the top
> level in QEMU when it traps any CMDQ_PROD write. So, either
> packing the fields of a command into a data structure or just
> forwarding the command directly, it seems to be the same for
> the matter of worrying about race conditions?

I think I misread your reply here :)

What you suggested is exactly forwarding the command, v.s. the host
reading the guest's command queue memory directly.

Although I haven't fully grasped Jason's "sorting" approach, this
could already simplify the data structure holding all the fields,
by passing a "__u64 cmd[2]" alone. A sample code:

+struct iommu_hwpt_invalidate_arm_smmuv3 {
+       struct iommu_iova_range range;
+       __u64 cmd[2];
+};

then...

+       cmd[0] = inv_info->cmd[0];
+       cmd[1] = inv_info->cmd[1];
+       /* The opcode lives in the low byte of cmd[0] */
+       switch (cmd[0] & 0xff) {
+       case CMDQ_OP_TLBI_NSNH_ALL:
+               /* Rewrite to a stage-1 invalidation scoped to this VM */
+               cmd[0] &= ~0xffULL;
+               cmd[0] |= CMDQ_OP_TLBI_NH_ALL;
+               fallthrough;
+       case CMDQ_OP_TLBI_NH_VA:
+       case CMDQ_OP_TLBI_NH_VAA:
+       case CMDQ_OP_TLBI_NH_ALL:
+       case CMDQ_OP_TLBI_NH_ASID:
+               /* Override the guest-provided VMID with the host S2 VMID */
+               cmd[0] &= ~CMDQ_TLBI_0_VMID;
+               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, smmu_domain->s2->s2_cfg.vmid);
+               arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, true);
+               break;
+       case CMDQ_OP_CFGI_CD:
+       case CMDQ_OP_CFGI_CD_ALL:
+               arm_smmu_sync_cd(smmu_domain,
+                                FIELD_GET(CMDQ_CFGI_0_SSID, cmd[0]), false);
+               break;
+       default:
+               return;
+       }

We could probably do a batch forwarding too, if it's worthwhile?

Thanks
Nic
Jason Gunthorpe March 20, 2023, 12:59 p.m. UTC | #21
On Fri, Mar 17, 2023 at 09:41:34AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Saturday, March 11, 2023 12:20 AM
> > 
> > What I'm broadly thinking is if we have to make the infrastructure for
> > VCMDQ HW accelerated invalidation then it is not a big step to also
> > have the kernel SW path use the same infrastructure just with a CPU
> > wake up instead of a MMIO poke.
> > 
> > Ie we have a SW version of VCMDQ to speed up SMMUv3 cases without HW
> > support.
> > 
> 
> I thought about this in VT-d context. Looks there are some difficulties.
> 
> The most prominent one is that head/tail of the VT-d invalidation queue
> are in MMIO registers. Handling it in kernel iommu driver suggests
> reading virtual tail register and updating virtual head register. Kind of 
> moving some vIOMMU awareness into the kernel which, iirc, is not
> a welcomed model.

qemu would trap the MMIO and generate an IOCTL with the written head
pointer. It isn't as efficient as having the kernel do the trap, but
does give batching.
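
To make that a bit more concrete, a batching uAPI could look roughly
like the sketch below. This is only an illustration; the structure
name and every field in it are invented, not an existing iommufd uAPI:

	/*
	 * Hypothetical uAPI: forward a window of the guest command queue.
	 * The VMM marshals the trapped commands into a buffer of its own
	 * and tells the kernel which entries to consume.
	 */
	struct iommu_hwpt_invalidate_queue {
		__u64 uptr;		/* user buffer holding the commands */
		__u32 entry_size;	/* bytes per command, 16 for SMMUv3 */
		__u32 nr_entries;	/* size of the buffer, in entries */
		__u32 head;		/* first entry to consume */
		__u32 tail;		/* one past the last entry to consume */
	};

On return, the kernel could write back into 'head' the index where it
stopped, which would also identify a failing command to the VMM.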

Jason
Jason Gunthorpe March 20, 2023, 1:03 p.m. UTC | #22
On Sat, Mar 11, 2023 at 03:56:50AM -0800, Nicolin Chen wrote:

> I recall that one difficulty is to pass the vSID from the guest
> down to the host kernel driver and to link with the pSID. What I
> did previously for VCMDQ was to set the SID_MATCH register with
> iommu_group_id(group) and set the SID_REPLACE register with the
> pSID. Then hyper will use the iommu_group_id to search for the
> pair of the registers, and to set vSID. Perhaps we should think
> of something smarter.

We need an ioctl for this, I think. To load a map of vSID to dev_id
into the driver. Kernel will convert dev_id to pSID. Driver will
program the map into HW.

SW path will program the map into an xarray
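
As a sketch (the structure and its name are made up for illustration),
the ioctl payload could be an array of pairs like:

	/*
	 * Hypothetical uAPI entry: bind one guest StreamID to an iommufd
	 * device. The kernel resolves dev_id to the physical SID, then the
	 * driver programs the pair into HW (VCMDQ) or an xarray (SW path).
	 */
	struct iommu_hwpt_arm_smmuv3_vsid_map {
		__u32 dev_id;	/* iommufd handle of the device */
		__u32 flags;	/* e.g. add/remove */
		__u64 vsid;	/* StreamID the guest uses for the device */
	};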

> > I suspect the answer to Robin's question on how to handle errors is
> > the most important deciding factor. If we have to capture and relay
> > actual HW errors back to userspace that really suggests we should do
> > something different than a synchronous ioctl.
> 
> A synchronous ioctl is to return some values other than defining
> > cache_invalidate_user as void, like we are doing now? A fault
> injection pathway to report CERROR asynchronously is what we've
> been doing though -- even with Eric's previous VFIO solution.

Where is this? How does it look?

Jason
Jason Gunthorpe March 20, 2023, 1:11 p.m. UTC | #23
On Sun, Mar 19, 2023 at 06:32:03PM -0700, Nicolin Chen wrote:

> +struct iommu_hwpt_invalidate_arm_smmuv3 {
> +       struct iommu_iova_range range;

what is this?

> +       __u64 cmd[2];
> +};

You still have to do something with the SID. We can't just allow any
un-validated SID value - the driver has to check the incoming SID
against allowed SIDs for this iommufd_ctx

Jason
Nicolin Chen March 20, 2023, 3:28 p.m. UTC | #24
On Mon, Mar 20, 2023 at 10:11:54AM -0300, Jason Gunthorpe wrote:
> On Sun, Mar 19, 2023 at 06:32:03PM -0700, Nicolin Chen wrote:
> 
> > +struct iommu_hwpt_invalidate_arm_smmuv3 {
> > +       struct iommu_iova_range range;
> 
> what is this?

Not used. A copy-n-paste mistake :(

> 
> > +       __u64 cmd[2];
> > +};
> 
> You still have to do something with the SID. We can't just allow any
> un-validated SID value - the driver has to check the incoming SID
> against allowed SIDs for this iommufd_ctx

Hmm, that's something "missing" even in the current design.

Yet, most of the TLBI commands don't hold an SID field. So,
a hypervisor that only traps the queue write-pointer movement
cannot get the exact vSID for a TLBI command. What our QEMU
code currently does is simply broadcast to all the devices
on the list of devices attached to the vSMMU, which means
that such an enforcement in the kernel would basically just
allow any vSID (device) that's attached to the domain?

Thanks
Nic
Nicolin Chen March 20, 2023, 3:56 p.m. UTC | #25
On Mon, Mar 20, 2023 at 10:03:04AM -0300, Jason Gunthorpe wrote:
> On Sat, Mar 11, 2023 at 03:56:50AM -0800, Nicolin Chen wrote:
> 
> > I recall that one difficulty is to pass the vSID from the guest
> > down to the host kernel driver and to link with the pSID. What I
> > did previously for VCMDQ was to set the SID_MATCH register with
> > iommu_group_id(group) and set the SID_REPLACE register with the
> > pSID. Then hyper will use the iommu_group_id to search for the
> > pair of the registers, and to set vSID. Perhaps we should think
> > of something smarter.
> 
> We need an ioctl for this, I think. To load a map of vSID to dev_id
> into the driver. Kernel will convert dev_id to pSID. Driver will
> program the map into HW.

Can we just pass a vSID via the alloc ioctl like this?

-----------------------------------------------------------
@@ -429,7 +429,7 @@ struct iommu_hwpt_arm_smmuv3 {
 #define IOMMU_SMMUV3_FLAG_VMID (1 << 1) /* vmid override */
        __u64 flags;
        __u32 s2vmid;
-       __u32 __reserved;
+       __u32 sid;
        __u64 s1ctxptr;
        __u64 s1cdmax;
        __u64 s1fmt;
-----------------------------------------------------------

An alloc is initiated by an SMMU_CMD_CFGI_STE command that has
an SID field anyway.

> SW path will program the map into an xarray

I found a tricky thing about SIDs in the SMMU driver when doing
this experiment: the SMMU kernel driver mostly handles devices
using struct arm_smmu_master. However, an arm_smmu_master might
have num_streams > 1, meaning a device can have multiple SIDs.
Though it seems that PCI devices might not be in this scope, a
plain xarray might not work for other types of devices in the
long run, if there are any?

> > > I suspect the answer to Robin's question on how to handle errors is
> > > the most important deciding factor. If we have to capture and relay
> > > actual HW errors back to userspace that really suggests we should do
> > > something different than a synchronous ioctl.
> > 
> > A synchronous ioctl is to return some values other than defining
> > cache_invalidate_user as void, like we are doing now? A fault
> > injection pathway to report CERROR asynchronously is what we've
> > been doing though -- even with Eric's previous VFIO solution.
> 
> Where is this? How does it look?

That's postponed with the PRI support, right? My use case does
not need PRI actually, but a fault injection pathway to guests.
This pathway should be able to take care of any CERROR (detected
by a host interrupt) or something funky in cache_invalidate_user
requests itself?

Thanks
Nic
Jason Gunthorpe March 20, 2023, 4:01 p.m. UTC | #26
On Mon, Mar 20, 2023 at 08:28:05AM -0700, Nicolin Chen wrote:
> On Mon, Mar 20, 2023 at 10:11:54AM -0300, Jason Gunthorpe wrote:
> > On Sun, Mar 19, 2023 at 06:32:03PM -0700, Nicolin Chen wrote:
> > 
> > > +struct iommu_hwpt_invalidate_arm_smmuv3 {
> > > +       struct iommu_iova_range range;
> > 
> > what is this?
> 
> Not used. A copy-n-paste mistake :(
> 
> > 
> > > +       __u64 cmd[2];
> > > +};
> > 
> > You still have to do something with the SID. We can't just allow any
> > un-validated SID value - the driver has to check the incoming SID
> > against allowed SIDs for this iommufd_ctx
> 
> Hmm, that's something "missing" even in the current design.
> 
> Yet, most of the TLBI commands don't hold an SID field. So,
> the hypervisor only trapping a queue write-pointer movement
> cannot get the exact vSID for a TLBI command. What our QEMU
> code currently does is simply broadcasting all the devices
> on the list of attaching devices to the vSMMU, which means
> that such an enforcement in the kernel would basically just
> allow any vSID (device) that's attached to the domain?

SID is only used for managing the ATC as far as I know. It is because
the ASID doesn't convey enough information to determine what PCI RID
to generate an ATC invalidation for.

We shouldn't be broadcasting for efficiency, at least it should not be
baked into the API.

You need to know what devices the vSID is targeting and issue
invalidations only for those devices.

Jason
Jason Gunthorpe March 20, 2023, 4:04 p.m. UTC | #27
On Mon, Mar 20, 2023 at 08:56:00AM -0700, Nicolin Chen wrote:
> On Mon, Mar 20, 2023 at 10:03:04AM -0300, Jason Gunthorpe wrote:
> > On Sat, Mar 11, 2023 at 03:56:50AM -0800, Nicolin Chen wrote:
> > 
> > > I recall that one difficulty is to pass the vSID from the guest
> > > down to the host kernel driver and to link with the pSID. What I
> > > did previously for VCMDQ was to set the SID_MATCH register with
> > > iommu_group_id(group) and set the SID_REPLACE register with the
> > > pSID. Then hyper will use the iommu_group_id to search for the
> > > pair of the registers, and to set vSID. Perhaps we should think
> > > of something smarter.
> > 
> > We need an ioctl for this, I think. To load a map of vSID to dev_id
> > into the driver. Kernel will convert dev_id to pSID. Driver will
> > program the map into HW.
> 
> Can we just pass a vSID via the alloc ioctl like this?
> 
> -----------------------------------------------------------
> @@ -429,7 +429,7 @@ struct iommu_hwpt_arm_smmuv3 {
>  #define IOMMU_SMMUV3_FLAG_VMID (1 << 1) /* vmid override */
>         __u64 flags;
>         __u32 s2vmid;
> -       __u32 __reserved;
> +       __u32 sid;
>         __u64 s1ctxptr;
>         __u64 s1cdmax;
>         __u64 s1fmt;
> -----------------------------------------------------------
> 
> An alloc is initiated by an SMMU_CMD_CFGI_STE command that has
> an SID filed anyway.

No, a HWPT is not a device or a SID. A HWPT is an ASID in the ARM
model.

dev_id is the SID.

The cfgi_ste will carry the vSID which is mapped to an iommufd dev_id.

The kernel has to translate the vSID to the dev_id to the pSID to
issue an ATC invalidation for the correct entity.

> > SW path will program the map into an xarray
> 
> I found a tricky thing about SIDs in the SMMU driver when doing
> this experiment: the SMMU kernel driver mostly handles devices
> using struct arm_smmu_master. However, an arm_smmu_master might
> have a num_streams>1, meaning a device can have multiple SIDs.
> Though it seems that PCI devices might not be in this scope, a
> plain xarray might not work for other type of devices in a long
> run, if there'd be?

You'd replicate each of the vSIDs of the extra SIDs in the xarray.
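
In driver terms, something like this (a sketch; the 'vsid_xa' xarray
and the per-stream vsid[] array are assumptions):

	/*
	 * A master with num_streams > 1 contributes one xarray entry per
	 * stream, so a lookup stays a plain xa_load() keyed by vSID.
	 */
	for (i = 0; i < master->num_streams; i++) {
		ret = xa_insert(&smmu->vsid_xa, vsid[i], master, GFP_KERNEL);
		if (ret)
			goto err_unwind;
	}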

> > > cache_invalidate_user as void, like we are doing now? A fault
> > > injection pathway to report CERROR asynchronously is what we've
> > > been doing though -- even with Eric's previous VFIO solution.
> > 
> > Where is this? How does it look?
> 
> That's postponed with the PRI support, right? My use case does
> not need PRI actually, but a fault injection pathway to guests.
> This pathway should be able to take care of any CERROR (detected
> by a host interrupt) or something funky in cache_invalidate_user
> requests itself?

I would expect that if invalidation can fail that we have a way to
signal that failure back to the guest.

Jason
Nicolin Chen March 20, 2023, 4:12 p.m. UTC | #28
On Mon, Mar 20, 2023 at 09:59:23AM -0300, Jason Gunthorpe wrote:
> On Fri, Mar 17, 2023 at 09:41:34AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Saturday, March 11, 2023 12:20 AM
> > > 
> > > What I'm broadly thinking is if we have to make the infrastructure for
> > > VCMDQ HW accelerated invalidation then it is not a big step to also
> > > have the kernel SW path use the same infrastructure just with a CPU
> > > wake up instead of a MMIO poke.
> > > 
> > > Ie we have a SW version of VCMDQ to speed up SMMUv3 cases without HW
> > > support.
> > > 
> > 
> > I thought about this in VT-d context. Looks there are some difficulties.
> > 
> > The most prominent one is that head/tail of the VT-d invalidation queue
> > are in MMIO registers. Handling it in kernel iommu driver suggests
> > reading virtual tail register and updating virtual head register. Kind of 
> > moving some vIOMMU awareness into the kernel which, iirc, is not
> > a welcomed model.
> 
> qemu would trap the MMIO and generate an IOCTL with the written head
> pointer. It isn't as efficient as having the kernel do the trap, but
> does give batching.

Rephrasing that to put into a design: the IOCTL would pass a
user pointer to the queue, the size of the queue, then a head
pointer and a tail pointer? Then the kernel reads out all the
commands between the head and the tail and handles all those
invalidation commands only?

Thanks
Nic
Nicolin Chen March 20, 2023, 4:35 p.m. UTC | #29
On Mon, Mar 20, 2023 at 01:01:53PM -0300, Jason Gunthorpe wrote:
> On Mon, Mar 20, 2023 at 08:28:05AM -0700, Nicolin Chen wrote:
> > On Mon, Mar 20, 2023 at 10:11:54AM -0300, Jason Gunthorpe wrote:
> > > On Sun, Mar 19, 2023 at 06:32:03PM -0700, Nicolin Chen wrote:
> > > 
> > > > +struct iommu_hwpt_invalidate_arm_smmuv3 {
> > > > +       struct iommu_iova_range range;
> > > 
> > > what is this?
> > 
> > Not used. A copy-n-paste mistake :(
> > 
> > > 
> > > > +       __u64 cmd[2];
> > > > +};
> > > 
> > > You still have to do something with the SID. We can't just allow any
> > > un-validated SID value - the driver has to check the incoming SID
> > > against allowed SIDs for this iommufd_ctx
> > 
> > Hmm, that's something "missing" even in the current design.
> > 
> > Yet, most of the TLBI commands don't hold an SID field. So,
> > the hypervisor only trapping a queue write-pointer movement
> > cannot get the exact vSID for a TLBI command. What our QEMU
> > code currently does is simply broadcasting all the devices
> > on the list of attaching devices to the vSMMU, which means
> > that such an enforcement in the kernel would basically just
> > allow any vSID (device) that's attached to the domain?
> 
> SID is only used for managing the ATC as far as I know. It is because
> the ASID doesn't convey enough information to determine what PCI RID
> to generate an ATC invalidation for.

Yes. And a CD invalidation too, though the kernel eventually
would do a broadcast to all devices that are using the same
CD.

> We shouldn't be broadcasting for efficiency, at least it should not be
> baked into the API.
> 
> You need to know what devices the vSID is targetting ang issues
> invalidations only for those devices.

I agree with that, yet I cannot think of a solution to achieve
that with the vSID alone. The QEMU code, by means of emulating a
physical SMMU, only reads the commands from the queue, without
knowing which device (vSID) actually sent these commands.

I could probably improve the current solution, which does an
entire broadcast, by using the ASID fields from the commands,
yet that would only get us as far as an ASID-based
broadcast...

Thanks
Nic
Nicolin Chen March 20, 2023, 4:59 p.m. UTC | #30
On Mon, Mar 20, 2023 at 01:04:35PM -0300, Jason Gunthorpe wrote:

> > > We need an ioctl for this, I think. To load a map of vSID to dev_id
> > > into the driver. Kernel will convert dev_id to pSID. Driver will
> > > program the map into HW.
> > 
> > Can we just pass a vSID via the alloc ioctl like this?
> > 
> > -----------------------------------------------------------
> > @@ -429,7 +429,7 @@ struct iommu_hwpt_arm_smmuv3 {
> >  #define IOMMU_SMMUV3_FLAG_VMID (1 << 1) /* vmid override */
> >         __u64 flags;
> >         __u32 s2vmid;
> > -       __u32 __reserved;
> > +       __u32 sid;
> >         __u64 s1ctxptr;
> >         __u64 s1cdmax;
> >         __u64 s1fmt;
> > -----------------------------------------------------------
> > 
> > An alloc is initiated by an SMMU_CMD_CFGI_STE command that has
> > an SID filed anyway.
> 
> No, a HWPT is not a device or a SID. a HWPT is an ASID in the ARM
> model.
> 
> dev_id is the SID.
> 
> The cfgi_ste will carry the vSID which is mapped to a iommufd dev_id.
> 
> The kernel has to translate the vSID to the dev_id to the pSID to
> issue an ATC invalidation for the correct entity.

OK. This narrative makes sense. I think our solution (the entire
stack) here mixes these two terms between HWPT/ASID and STE/SID.

What QEMU does is trapping an SMMU_CMD_CFGI_STE command to send
the host an HWPT alloc ioctl. The former one is based on an SID
or a device, while the latter one is based on ASID.

So the correct way should be for QEMU to maintain an ASID-based
list, corresponding to the s1ctxptr from STEs, and only send an
alloc ioctl upon a new s1ctxptr/ASID. Meanwhile, at every trap
of SMMU_CMD_CFGI_STE, it calls a separate ioctl to tie a vSID to
a dev_id (and pSID accordingly).

In other words, an SMMU_CMD_CFGI_STE should do a mandatory SID
ioctl and an optional HWPT alloc ioctl (only allocating a HWPT if
the s1ctxptr in the STE is new).

What could be a good prototype of the ioctl? Would it be a VFIO
device one or IOMMUFD one?

> > > SW path will program the map into an xarray
> > 
> > I found a tricky thing about SIDs in the SMMU driver when doing
> > this experiment: the SMMU kernel driver mostly handles devices
> > using struct arm_smmu_master. However, an arm_smmu_master might
> > have a num_streams>1, meaning a device can have multiple SIDs.
> > Though it seems that PCI devices might not be in this scope, a
> > plain xarray might not work for other type of devices in a long
> > run, if there'd be?
> 
> You'd replicate each of the vSIDs of the extra SIDs in the xarray.

Noted it down.

> > > > cache_invalidate_user as void, like we are doing now? A fault
> > > > injection pathway to report CERROR asynchronously is what we've
> > > > been doing though -- even with Eric's previous VFIO solution.
> > > 
> > > Where is this? How does it look?
> > 
> > That's postponed with the PRI support, right? My use case does
> > not need PRI actually, but a fault injection pathway to guests.
> > This pathway should be able to take care of any CERROR (detected
> > by a host interrupt) or something funky in cache_invalidate_user
> > requests itself?
> 
> I would expect that if invalidation can fail that we have a way to
> signal that failure back to the guest.

That's plausible to me, and it could apply to a translation
fault too. So, should we add back the iommufd infrastructure
for the fault injection (without PRI), in v2?

Thanks
Nic
Jason Gunthorpe March 20, 2023, 6 p.m. UTC | #31
On Mon, Mar 20, 2023 at 09:12:06AM -0700, Nicolin Chen wrote:
> On Mon, Mar 20, 2023 at 09:59:23AM -0300, Jason Gunthorpe wrote:
> > On Fri, Mar 17, 2023 at 09:41:34AM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > Sent: Saturday, March 11, 2023 12:20 AM
> > > > 
> > > > What I'm broadly thinking is if we have to make the infrastructure for
> > > > VCMDQ HW accelerated invalidation then it is not a big step to also
> > > > have the kernel SW path use the same infrastructure just with a CPU
> > > > wake up instead of a MMIO poke.
> > > > 
> > > > Ie we have a SW version of VCMDQ to speed up SMMUv3 cases without HW
> > > > support.
> > > > 
> > > 
> > > I thought about this in VT-d context. Looks there are some difficulties.
> > > 
> > > The most prominent one is that head/tail of the VT-d invalidation queue
> > > are in MMIO registers. Handling it in kernel iommu driver suggests
> > > reading virtual tail register and updating virtual head register. Kind of 
> > > moving some vIOMMU awareness into the kernel which, iirc, is not
> > > a welcomed model.
> > 
> > qemu would trap the MMIO and generate an IOCTL with the written head
> > pointer. It isn't as efficient as having the kernel do the trap, but
> > does give batching.
> 
> Rephrasing that to put into a design: the IOCTL would pass a
> user pointer to the queue, the size of the queue, then a head
> pointer and a tail pointer? Then the kernel reads out all the
> commands between the head and the tail and handles all those
> invalidation commands only?

Yes, that is one possible design
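
A skeleton of the kernel-side loop for that design could be as below,
reusing the queue-window structure sketched earlier; note that
arm_smmu_handle_user_cmd() is an invented helper and the queue size is
assumed to be a power of two:

	/* Consume guest commands in [head, tail) from the user buffer */
	for (i = inv->head; i != inv->tail;
	     i = (i + 1) & (inv->nr_entries - 1)) {
		u64 cmd[2];

		/* SMMUv3 commands are 16 bytes each */
		if (copy_from_user(cmd, u64_to_user_ptr(inv->uptr) + i * 16,
				   sizeof(cmd)))
			return -EFAULT;
		ret = arm_smmu_handle_user_cmd(smmu_domain, cmd);
		if (ret)
			break;	/* report the failing index back to the VMM */
	}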

Jason
Jason Gunthorpe March 20, 2023, 6:07 p.m. UTC | #32
On Mon, Mar 20, 2023 at 09:35:20AM -0700, Nicolin Chen wrote:

> > You need to know what devices the vSID is targeting and issue
> > invalidations only for those devices.
> 
> I agree with that, yet cannot think of a solution to achieve
> that out of vSID. QEMU code by means of emulating a physical
> SMMU only reads the commands from the queue, without knowing
> which device (vSID) actually sent these commands.

Huh?

CMD_ATC_INV has the SID

Other commands have the ASID.

You never need to cross an ASID to a SID or vice versa.

If the guest is aware of ATS it will issue CMD_ATC_INV with vSIDs, and
the hypervisor just needs to convert vSID to pSID.

Otherwise vSID doesn't matter because it isn't used in the invalidation
API and you are just handling ASIDs that only need the VM_ID scope
applied.

Jason
Jason Gunthorpe March 20, 2023, 6:45 p.m. UTC | #33
On Mon, Mar 20, 2023 at 09:59:45AM -0700, Nicolin Chen wrote:
> On Mon, Mar 20, 2023 at 01:04:35PM -0300, Jason Gunthorpe wrote:
> 
> > > > We need an ioctl for this, I think. To load a map of vSID to dev_id
> > > > into the driver. Kernel will convert dev_id to pSID. Driver will
> > > > program the map into HW.
> > > 
> > > Can we just pass a vSID via the alloc ioctl like this?
> > > 
> > > -----------------------------------------------------------
> > > @@ -429,7 +429,7 @@ struct iommu_hwpt_arm_smmuv3 {
> > >  #define IOMMU_SMMUV3_FLAG_VMID (1 << 1) /* vmid override */
> > >         __u64 flags;
> > >         __u32 s2vmid;
> > > -       __u32 __reserved;
> > > +       __u32 sid;
> > >         __u64 s1ctxptr;
> > >         __u64 s1cdmax;
> > >         __u64 s1fmt;
> > > -----------------------------------------------------------
> > > 
> > > An alloc is initiated by an SMMU_CMD_CFGI_STE command that has
> > > an SID filed anyway.
> > 
> > No, a HWPT is not a device or a SID. a HWPT is an ASID in the ARM
> > model.
> > 
> > dev_id is the SID.
> > 
> > The cfgi_ste will carry the vSID which is mapped to a iommufd dev_id.
> > 
> > The kernel has to translate the vSID to the dev_id to the pSID to
> > issue an ATC invalidation for the correct entity.
> 
> OK. This narrative makes sense. I think our solution (the entire
> stack) here mixes these two terms between HWPT/ASID and STE/SID.

HWPT is an "ASID/DID" on Intel and a CD table on SMMUv3

> What QEMU does is trapping an SMMU_CMD_CFGI_STE command to send
> the host an HWPT alloc ioctl. The former one is based on an SID
> or a device, while the latter one is based on ASID.
> 
> So the correct way should be for QEMU to maintain an ASID-based
> list, corresponding to the s1ctxptr from STEs, and only send an
> alloc ioctl upon a new s1ctxptr/ASID. Meanwhile, at every trap
> of SMMU_CMD_CFGI_STE, it calls a separate ioctl to tie a vSID to
> a dev_id (and pSID accordingly).

It is not an ASID, it is just s1ctxptr's - de-duplicate them.

Do something about SMMUv3 not being able to interwork iommu_domains
across instances

> In another word, an SMMU_CMD_CFGI_STE should do a mandatory SID
> ioctl and an optional HWPT alloc ioctl (only allocates a HWPT if
> the s1ctxptr in the STE is new).

No, there is no SID ioctl at the STE stage.

The vSID was decided by qemu before the VM booted. It created it when
it built the vRID and the vPCI device. The vSID is tied to the vfio
device FD.

Somehow the VM knows the relationship between vSID and vPCI/vRID. IIRC
this is passed in through ACPI from qemu.

So vSID is an alias for the dev_id in iommufd language, and qemu
always has a translation table for it.

So CFGI_STE maps to allocating a de-duplicated HWPT for the CD table,
and then a replace operation on the device FD represented by the vSID
to change the pSTE to point to the HWPT.

The HWPT is effectively the "shadow STE".
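
In VMM pseudocode, the CFGI_STE trap would then boil down to something
like this (all helper names here are invented for illustration):

	/* De-duplicate CD tables by s1ctxptr, then replace via the vSID */
	void vsmmu_handle_cfgi_ste(VSMMUState *s, uint32_t vsid, STE *ste)
	{
		VSMMUDevice *vdev = vsmmu_find_dev_by_vsid(s, vsid);
		HWPT *hwpt = vsmmu_hwpt_cache_get(s, ste->s1ctxptr);

		if (!hwpt)	/* first STE pointing at this CD table */
			hwpt = iommufd_alloc_nested_hwpt(s->s2_hwpt, ste);
		/* the "shadow STE": make the pSTE point at this HWPT */
		vfio_device_attach_hwpt(vdev->devfd, hwpt->id);
	}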

> What could be a good prototype of the ioctl? Would it be a VFIO
> device one or IOMMUFD one?

If we load the vSID table it should be a iommufd one, linked to the
ARM SMMUv3 driver and probably take in a pointer to an array of
vSID/dev_id pairs. Maybe an add/remove type of operation.

> > I would expect that if invalidation can fail that we have a way to
> > signal that failure back to the guest.
> 
> That's plausible to me, and it could apply to a translation
> fault too. So, should we add back the iommufd infrastructure
> for the fault injection (without PRI), in v2?

It would be nice if things were not so big, I don't think we need to
tackle translation fault at this time, but we should be thinking about
what invalidation cmd fault converts into.

Jason
Nicolin Chen March 20, 2023, 8:46 p.m. UTC | #34
On Mon, Mar 20, 2023 at 03:07:13PM -0300, Jason Gunthorpe wrote:
> On Mon, Mar 20, 2023 at 09:35:20AM -0700, Nicolin Chen wrote:
> 
> > > You need to know what devices the vSID is targeting and issue
> > > invalidations only for those devices.
> > 
> > I agree with that, yet cannot think of a solution to achieve
> > that out of vSID. QEMU code by means of emulating a physical
> > SMMU only reads the commands from the queue, without knowing
> > which device (vSID) actually sent these commands.
> 
> Huh?
> 
> CMD_ATC_INV has the SID
> 
> Other commands have the ASID.
> 
> You never need to cross an ASID to a SID or vice versa.
> 
> If the guest is aware of ATS it will issue CMD_ATC_INV with vSIDs, and
> the hypervisor just needs to convert vSID to pSID.
> 
> Otherwise vSID doesn't matter because it isn't used in the invalidation
> API and you are just handling ASIDs that only need the VM_ID scope
> applied.

Yea, I was thinking of your point (at the top) about how we could
ensure that an invalidation is targeting the correct vSID. So,
that narrative was only about CMD_ATC_INV...

Actually, we don't forward CMD_ATC_INV in QEMU. In another
thread, Kevin also remarked whether we need to support that
in the host or not. And I plan to drop CMD_ATC_INV from the
list of cache_invalidate_user(), following his comments and
the QEMU situation. Our uAPI, either forwarding the commands
or a package of queue info, should be able to cover this in
the future whenever we think it's required.

Combining the two parts above, we probably don't need to know
at this moment which vSID an invalidation is targeting, nor to
restrict it to executing only for those devices, since the rest
of the commands are all ASID-based.

Thanks
Nic
Nicolin Chen March 20, 2023, 9:22 p.m. UTC | #35
On Mon, Mar 20, 2023 at 03:45:54PM -0300, Jason Gunthorpe wrote:
> On Mon, Mar 20, 2023 at 09:59:45AM -0700, Nicolin Chen wrote:
> > On Mon, Mar 20, 2023 at 01:04:35PM -0300, Jason Gunthorpe wrote:
> > 
> > > > > We need an ioctl for this, I think. To load a map of vSID to dev_id
> > > > > into the driver. Kernel will convert dev_id to pSID. Driver will
> > > > > program the map into HW.
> > > > 
> > > > Can we just pass a vSID via the alloc ioctl like this?
> > > > 
> > > > -----------------------------------------------------------
> > > > @@ -429,7 +429,7 @@ struct iommu_hwpt_arm_smmuv3 {
> > > >  #define IOMMU_SMMUV3_FLAG_VMID (1 << 1) /* vmid override */
> > > >         __u64 flags;
> > > >         __u32 s2vmid;
> > > > -       __u32 __reserved;
> > > > +       __u32 sid;
> > > >         __u64 s1ctxptr;
> > > >         __u64 s1cdmax;
> > > >         __u64 s1fmt;
> > > > -----------------------------------------------------------
> > > > 
> > > > An alloc is initiated by an SMMU_CMD_CFGI_STE command that has
> > > > an SID filed anyway.
> > > 
> > > No, a HWPT is not a device or a SID. a HWPT is an ASID in the ARM
> > > model.
> > > 
> > > dev_id is the SID.
> > > 
> > > The cfgi_ste will carry the vSID which is mapped to a iommufd dev_id.
> > > 
> > > The kernel has to translate the vSID to the dev_id to the pSID to
> > > issue an ATC invalidation for the correct entity.
> > 
> > OK. This narrative makes sense. I think our solution (the entire
> > stack) here mixes these two terms between HWPT/ASID and STE/SID.
> 
> HWPT is an "ASID/DID" on Intel and a CD table on SMMUv3
> 
> > What QEMU does is trapping an SMMU_CMD_CFGI_STE command to send
> > the host an HWPT alloc ioctl. The former one is based on an SID
> > or a device, while the latter one is based on ASID.
> > 
> > So the correct way should be for QEMU to maintain an ASID-based
> > list, corresponding to the s1ctxptr from STEs, and only send an
> > alloc ioctl upon a new s1ctxptr/ASID. Meanwhile, at every trap
> > of SMMU_CMD_CFGI_STE, it calls a separate ioctl to tie a vSID to
> > a dev_id (and pSID accordingly).
> 
> It is not an ASID, it is just s1ctxptr's - de-duplicate them.

SMMU has "ASID" too. And it's one per CD table. It can be also
seen as one per iommu_domain.

The following are lines from arm_smmu_domain_finalise_s1():
	...
	ret = xa_alloc(&arm_smmu_asid_xa, &asid, &cfg->cd,
		       XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
	...
	cfg->cd.asid    = (u16)asid;
	...

> Do something about SMMUv3 not being able to interwork iommu_domains
> across instances

I don't follow this one. Device instances?

> > In another word, an SMMU_CMD_CFGI_STE should do a mandatory SID
> > ioctl and an optional HWPT alloc ioctl (only allocates a HWPT if
> > the s1ctxptr in the STE is new).
> 
> No, there is no SID ioctl at the STE stage.
> 
> The vSID was decided by qemu before the VM booted. It created it when
> it built the vRID and the vPCI device. The vSID is tied to the vfio
> device FD.
> 
> Somehow the VM knows the relationship between vSID and vPCI/vRID. IIRC
> this is passed in through ACPI from qemu.

Yes.

> So vSID is an alias for the dev_id in iommufd language, and qemu
> always has a translation table for it.

I see.

> So CFGI_STE maps to allocating a de-duplicated HWPT for the CD table,
> and then a replace operation on the device FD represented by the vSID
> to change the pSTE to point to the HWPT.
> 
> The HWPT is effectively the "shadow STE".

IIUIC, the ioctl for the link of vSID/dev_id should happen at
the stage when the VM boots, while the HWPT alloc ioctl happens
at CFGI_STE.

> > What could be a good prototype of the ioctl? Would it be a VFIO
> > device one or IOMMUFD one?
> 
> If we load the vSID table it should be a iommufd one, linked to the
> ARM SMMUv3 driver and probably take in a pointer to an array of
> vSID/dev_id pairs. Maybe an add/remove type of operation.

Will try some solution.

> > > I would expect that if invalidation can fail that we have a way to
> > > signal that failure back to the guest.
> > 
> > That's plausible to me, and it could apply to a translation
> > fault too. So, should we add back the iommufd infrastructure
> > for the fault injection (without PRI), in v2?
> 
> It would be nice if things were not so big, I don't think we need to
> tackle translation fault at this time, but we should be thinking about
> what invalidation cmd fault converts into.

Will see if we can add a compact one, or some other solution
for invalidation fault only.

Thanks
Nic
Jason Gunthorpe March 20, 2023, 10:14 p.m. UTC | #36
On Mon, Mar 20, 2023 at 01:46:52PM -0700, Nicolin Chen wrote:
> On Mon, Mar 20, 2023 at 03:07:13PM -0300, Jason Gunthorpe wrote:
> > On Mon, Mar 20, 2023 at 09:35:20AM -0700, Nicolin Chen wrote:
> > 
> > > > You need to know what devices the vSID is targeting and issue
> > > > invalidations only for those devices.
> > > 
> > > I agree with that, yet cannot think of a solution to achieve
> > > that out of vSID. QEMU code by means of emulating a physical
> > > SMMU only reads the commands from the queue, without knowing
> > > which device (vSID) actually sent these commands.
> > 
> > Huh?
> > 
> > CMD_ATC_INV has the SID
> > 
> > Other commands have the ASID.
> > 
> > You never need to cross an ASID to a SID or vice versa.
> > 
> > If the guest is aware of ATS it will issue CMD_ATC_INV with vSIDs, and
> > the hypervisor just needs to convert vSID to pSID.
> > 
> > Otherwise vSID doesn't matter because it isn't used in the invalidation
> > API and you are just handling ASIDs that only need the VM_ID scope
> > applied.
> 
> Yea, I was thinking of your point (at the top) how we could
> ensure if an invalidation is targeting a correct vSID. So,
> that narrative was only about CMD_ATC_INV...
> 
> Actually, we don't forward CMD_ATC_INV in QEMU. In another
> thread, Kevin also remarked whether we need to support that
> in the host or not. And I plan to drop CMD_ATC_INV from the
> list of cache_invalidate_user(), following his comments and
> the QEMU situation. Our uAPI, either forwarding the commands
> or a package of queue info, should be able to cover this in
> the future whenever we think it's required.

Something has to generate CMD_ATC_INV.

How do you plan to generate this from the hypervisor based on ASID
invalidations?

The hypervisor doesn't know what ASIDs are connected to what SIDs to
generate the ATC?

Intel is different, they know what devices the vDID is connected to,
so when they get a vDID invalidation they can elaborate it into a ATC
invalidation. ARM doesn't have that information.

Jason
Jason Gunthorpe March 20, 2023, 10:19 p.m. UTC | #37
On Mon, Mar 20, 2023 at 02:22:42PM -0700, Nicolin Chen wrote:
> > > What QEMU does is trapping an SMMU_CMD_CFGI_STE command to send
> > > the host an HWPT alloc ioctl. The former one is based on an SID
> > > or a device, while the latter one is based on ASID.
> > > 
> > > So the correct way should be for QEMU to maintain an ASID-based
> > > list, corresponding to the s1ctxptr from STEs, and only send an
> > > alloc ioctl upon a new s1ctxptr/ASID. Meanwhile, at every trap
> > > of SMMU_CMD_CFGI_STE, it calls a separate ioctl to tie a vSID to
> > > a dev_id (and pSID accordingly).
> > 
> > It is not ASID, it just s1ctxptr's - de-duplicate them.
> 
> SMMU has "ASID" too. And it's one per CD table. It can be also
> seen as one per iommu_domain.

Yes and no, the ASID in ARM is per CDE not per CD table. It is
associated with each TTB0/1 pointer and is effectively the handle for
the IOPTEs.

Every iommu_domain that has a TTB0/1 (ie represents IOPTEs) should
have an ASID.

The "nested" iommu_domains don't represent IOPTEs and don't have ASIDs.

The nested domains are just "shadow STEs".

> > Do something about SMMUv3 not being able to interwork iommu_domains
> > across instances
> 
> I don't follow this one. Device instances?

There is some code that makes sure each iommu_domain is hooked to only
one smmu driver instance, IIRC.
 
> IIUIC, the ioctl for the link of vSID/dev_id should happen at
> the stage when the VM boots, while the HWPT alloc ioctl happens
> at CFGI_STE.

Yes
 
> > > What could be a good prototype of the ioctl? Would it be a VFIO
> > > device one or IOMMUFD one?
> > 
> > If we load the vSID table it should be a iommufd one, linked to the
> > ARM SMMUv3 driver and probably take in a pointer to an array of
> > vSID/dev_id pairs. Maybe an add/remove type of operation.
> 
> Will try some solution.

It is only necessary if you want to do batching

For non-batching the SID invalidation should be done differently with
a device_id input instead. That is a bit tricky to organize as you
want iommufd to get back a 'struct device *' from the ID.

Jason
Tian, Kevin March 21, 2023, 8:34 a.m. UTC | #38
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, March 21, 2023 2:01 AM
> 
> On Mon, Mar 20, 2023 at 09:12:06AM -0700, Nicolin Chen wrote:
> > On Mon, Mar 20, 2023 at 09:59:23AM -0300, Jason Gunthorpe wrote:
> > > On Fri, Mar 17, 2023 at 09:41:34AM +0000, Tian, Kevin wrote:
> > > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > > Sent: Saturday, March 11, 2023 12:20 AM
> > > > >
> > > > > What I'm broadly thinking is if we have to make the infrastructure for
> > > > > VCMDQ HW accelerated invalidation then it is not a big step to also
> > > > > have the kernel SW path use the same infrastructure just with a CPU
> > > > > wake up instead of a MMIO poke.
> > > > >
> > > > > Ie we have a SW version of VCMDQ to speed up SMMUv3 cases without HW
> > > > > support.
> > > > >
> > > >
> > > > I thought about this in VT-d context. Looks there are some difficulties.
> > > >
> > > > The most prominent one is that head/tail of the VT-d invalidation queue
> > > > are in MMIO registers. Handling it in kernel iommu driver suggests
> > > > reading virtual tail register and updating virtual head register. Kind of
> > > > moving some vIOMMU awareness into the kernel which, iirc, is not
> > > > a welcomed model.
> > >
> > > qemu would trap the MMIO and generate an IOCTL with the written head
> > > pointer. It isn't as efficient as having the kernel do the trap, but
> > > does give batching.
> >
> > Rephrasing that to put into a design: the IOCTL would pass a
> > user pointer to the queue, the size of the queue, then a head
> > pointer and a tail pointer? Then the kernel reads out all the
> > commands between the head and the tail and handles all those
> > invalidation commands only?
> 
> Yes, that is one possible design
> 

If we cannot have the short path in the kernel then I'm not sure of
the value of using the native format and queue in the uAPI. Batching
can be enabled over any format.

Btw, probably a dumb question. The current invalidation IOCTL is
per hwpt. If picking a native format, does it suggest making the IOCTL
per iommufd, given the native format is per IOMMU and could carry a
scope bigger than a hwpt?
Jason Gunthorpe March 21, 2023, 11:48 a.m. UTC | #39
On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:

> > > Rephrasing that to put into a design: the IOCTL would pass a
> > > user pointer to the queue, the size of the queue, then a head
> > > pointer and a tail pointer? Then the kernel reads out all the
> > > commands between the head and the tail and handles all those
> > > invalidation commands only?
> > 
> > Yes, that is one possible design
> 
> If we cannot have the short path in the kernel then I'm not sure the
> value of using native format and queue in the uAPI. Batching can
> be enabled over any format.

SMMUv3 will have a hardware short path where the HW itself runs the
VM's command queue and does this logic.

So I like the symmetry of the SW path being close to that.

> Btw probably a dumb question. The current invalidation IOCTL is
> per hwpt. If picking a native format does it suggest making the IOCTL
> per iommufd given native format is per IOMMU and could carry
> scope bigger than a hwpt.

At least on SMMUv3 it depends on what happens with VMID.

If we can tie the VMID to the iommu_domain then the invalidation has
to flow through the iommu_domain to pick up the VMID.

If the VMID is tied to the entire iommufd_ctx then it can flow
independently.

Jason
Nicolin Chen March 22, 2023, 5:14 a.m. UTC | #40
On Mon, Mar 20, 2023 at 07:14:17PM -0300, Jason Gunthorpe wrote:
> On Mon, Mar 20, 2023 at 01:46:52PM -0700, Nicolin Chen wrote:
> > On Mon, Mar 20, 2023 at 03:07:13PM -0300, Jason Gunthorpe wrote:
> > > On Mon, Mar 20, 2023 at 09:35:20AM -0700, Nicolin Chen wrote:
> > > 
> > > > > You need to know what devices the vSID is targeting and issue
> > > > > invalidations only for those devices.
> > > > 
> > > > I agree with that, yet cannot think of a solution to achieve
> > > > that out of vSID. QEMU code by means of emulating a physical
> > > > SMMU only reads the commands from the queue, without knowing
> > > > which device (vSID) actually sent these commands.
> > > 
> > > Huh?
> > > 
> > > CMD_ATC_INV has the SID
> > > 
> > > Other commands have the ASID.
> > > 
> > > You never need to cross an ASID to a SID or vice versa.
> > > 
> > > If the guest is aware of ATS it will issue CMD_ATC_INV with vSIDs, and
> > > the hypervisor just needs to convert vSID to pSID.
> > > 
> > > Otherwise vSID doesn't matter because it isn't used in the invalidation
> > > API and you are just handling ASIDs that only need the VM_ID scope
> > > applied.
> > 
> > Yea, I was thinking of your point (at the top) how we could
> > ensure if an invalidation is targeting a correct vSID. So,
> > that narrative was only about CMD_ATC_INV...
> > 
> > Actually, we don't forward CMD_ATC_INV in QEMU. In another
> > thread, Kevin also remarked whether we need to support that
> > in the host or not. And I plan to drop CMD_ATC_INV from the
> > list of cache_invalidate_user(), following his comments and
> > the QEMU situation. Our uAPI, either forwarding the commands
> > or a package of queue info, should be able to cover this in
> > the future whenever we think it's required.
> 
> Something has to generate CMD_ATC_INV.
>
> How do you plan to generate this from the hypervisor based on ASID
> invalidations?
>
> The hypervisor doesn't know what ASIDs are connected to what SIDs to
> generate the ATC?
> 
> Intel is different, they know what devices the vDID is connected to,
> so when they get a vDID invalidation they can elaborate it into a ATC
> invalidation. ARM doesn't have that information.

I see. Perhaps vSMMU still needs to forward CMD_ATC_INV. And,
as you suggested, it should go through a vSID sanity check by
the host handler. We can find the corresponding pSID to check
if the device is associated with the iommu_domain?
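
For instance (a sketch only; the vsid_xa xarray comes from the mapping
ioctl discussed earlier, and the single-stream pSID lookup is an
assumption):

	/* CMD_ATC_INV: validate the vSID before patching in the pSID */
	master = xa_load(&smmu_domain->vsid_xa,
			 FIELD_GET(CMDQ_ATC_0_SID, cmd[0]));
	if (!master)	/* vSID was never bound to a device here */
		return -EINVAL;
	cmd[0] &= ~CMDQ_ATC_0_SID;
	cmd[0] |= FIELD_PREP(CMDQ_ATC_0_SID, master->streams[0].id);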

Thanks
Nic
Nicolin Chen March 22, 2023, 6:42 a.m. UTC | #41
On Tue, Mar 21, 2023 at 08:48:31AM -0300, Jason Gunthorpe wrote:
> On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:
> 
> > > > Rephrasing that to put into a design: the IOCTL would pass a
> > > > user pointer to the queue, the size of the queue, then a head
> > > > pointer and a tail pointer? Then the kernel reads out all the
> > > > commands between the head and the tail and handles all those
> > > > invalidation commands only?
> > > 
> > > Yes, that is one possible design
> > 
> > If we cannot have the short path in the kernel then I'm not sure the
> > value of using native format and queue in the uAPI. Batching can
> > be enabled over any format.
> 
> SMMUv3 will have a hardware short path where the HW itself runs the
> VM's command queue and does this logic.
> 
> So I like the symmetry of the SW path being close to that.

A tricky thing here that I just realized:

With VCMDQ, the guest will have two CMDQs. One is the vSMMU's
CMDQ handling all non-TLBI commands like CMD_CFGI_STE via the
invalidation IOCTL, and the other is the hardware-accelerated
VCMDQ handling all TLBI commands in HW. In this setup, we will
need a VCMDQ kernel driver to dispatch commands into the two
different queues.

Yet, it feels a bit different with this SW path exposing the
entire SMMU CMDQ, since now theoretically non-TLBI and TLBI
commands can be interleaved in one batch, so the hypervisor
should go through the queue first to handle and delete all
non-TLBI commands, and then forward the CMDQ to the host to
run the remaining TLBI commands, if there are any.

> > Btw probably a dumb question. The current invalidation IOCTL is
> > per hwpt. If picking a native format does it suggest making the IOCTL
> > per iommufd given native format is per IOMMU and could carry
> > scope bigger than a hwpt.
> 
> At least on SMMUv3 it depends on what happens with VMID.
> 
> If we can tie the VMID to the iommu_domain then the invalidation has
> to flow through the iommu_domain to pick up the VMID.

Yes. This is what we do now. An invalidation handler finds the
corresponding S2 domain pointer to pick up the VMID. And it'd
be safe, until the S2 domain gets replaced with another domain
I think?

> If the VMID is tied to the entire iommufd_ctx then it can flow
> independently.

One more thing about the VMID unification is that SMMU might
have a limitation on the VMID range:
	smmu->vmid_bits = reg & IDR0_VMID16 ? 16 : 8;
	...
	vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);

So, we'd likely need a CAP for that, to apply some limitation
with the iommufd_ctx too?

Thanks
Nic
Jason Gunthorpe March 22, 2023, 12:43 p.m. UTC | #42
On Tue, Mar 21, 2023 at 11:42:25PM -0700, Nicolin Chen wrote:
> On Tue, Mar 21, 2023 at 08:48:31AM -0300, Jason Gunthorpe wrote:
> > On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:
> > 
> > > > > Rephrasing that to put into a design: the IOCTL would pass a
> > > > > user pointer to the queue, the size of the queue, then a head
> > > > > pointer and a tail pointer? Then the kernel reads out all the
> > > > > commands between the head and the tail and handles all those
> > > > > invalidation commands only?
> > > > 
> > > > Yes, that is one possible design
> > > 
> > > If we cannot have the short path in the kernel then I'm not sure the
> > > value of using native format and queue in the uAPI. Batching can
> > > be enabled over any format.
> > 
> > SMMUv3 will have a hardware short path where the HW itself runs the
> > VM's command queue and does this logic.
> > 
> > So I like the symmetry of the SW path being close to that.
> 
> A tricky thing here that I just realized:
> 
> With VCMDQ, the guest will have two CMDQs. One is the vSMMU's
> CMDQ handling all non-TLBI commands like CMD_CFGI_STE via the
> invalidation IOCTL, and the other hardware accelerated VCMDQ
> handling all TLBI commands by the HW. In this setup, we will
> need a VCMDQ kernel driver to dispatch commands into the two
> different queues.

You mean a VM kernel driver? Yes that was always the point, the VM
would use the extra CMDQ's only for invalidation

The main CMDQ would work as today through a trap.

> Yet, it feels a bit different with this SW path exposing the
> entire SMMU CMDQ, since now theoretically non-TLBI and TLBI
> commands can be interlaced in one batch, so the hypervisor
> should go through the queue first to handle and delete all
> non-TLBI commands, and then forward the CMDQ to the host to
> run remaining TLBI commands, if there's any.

Yes, there are a few different ways to handle this and still preserve
batching. It is part of the reason it would be hard to make the kernel
natively parse the commandq

On the other hand, we could add some more native kernel support for a
SW emulated vCMDQ and that might be interesting for performance.

One of the biggest reasons to use nesting is to get to vSVA and
invalidation performance is very important in a vSVA environment. We
should not ignore this in the design.

> > If the VMID is tied to the entire iommufd_ctx then it can flow
> > independently.
> 
> One more thing about the VMID unification is that SMMU might
> have limitation on the VMID range:
> 	smmu->vmid_bits = reg & IDR0_VMID16 ? 16 : 8;
> 	...
> 	vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);
> 
> So, we'd likely need a CAP for that, to apply some limitation
> with the iommufd_ctx too?

I'd imagine the driver would have to allocate its internal data
against the iommufd_ctx

I'm not sure how best to organize that if it is the way to go.

Do we have a use case for more than one S2 iommu_domain on ARM?

Jason
Nicolin Chen March 22, 2023, 5:11 p.m. UTC | #43
On Wed, Mar 22, 2023 at 09:43:43AM -0300, Jason Gunthorpe wrote:
> On Tue, Mar 21, 2023 at 11:42:25PM -0700, Nicolin Chen wrote:
> > On Tue, Mar 21, 2023 at 08:48:31AM -0300, Jason Gunthorpe wrote:
> > > On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:
> > > 
> > > > > > Rephrasing that to put into a design: the IOCTL would pass a
> > > > > > user pointer to the queue, the size of the queue, then a head
> > > > > > pointer and a tail pointer? Then the kernel reads out all the
> > > > > > commands between the head and the tail and handles all those
> > > > > > invalidation commands only?
> > > > > 
> > > > > Yes, that is one possible design
> > > > 
> > > > If we cannot have the short path in the kernel then I'm not sure the
> > > > value of using native format and queue in the uAPI. Batching can
> > > > be enabled over any format.
> > > 
> > > SMMUv3 will have a hardware short path where the HW itself runs the
> > > VM's command queue and does this logic.
> > > 
> > > So I like the symmetry of the SW path being close to that.
> > 
> > A tricky thing here that I just realized:
> > 
> > With VCMDQ, the guest will have two CMDQs. One is the vSMMU's
> > CMDQ handling all non-TLBI commands like CMD_CFGI_STE via the
> > invalidation IOCTL, and the other hardware accelerated VCMDQ
> > handling all TLBI commands by the HW. In this setup, we will
> > need a VCMDQ kernel driver to dispatch commands into the two
> > different queues.
> 
> You mean a VM kernel driver? Yes that was always the point, the VM
> would use the extra CMDQ's only for invalidation

Yes, I was saying the guest kernel driver would dispatch the
commands.

> The main CMDQ would work as today through a trap.

Yes.

> > Yet, it feels a bit different with this SW path exposing the
> > entire SMMU CMDQ, since now theoretically non-TLBI and TLBI
> > commands can be interlaced in one batch, so the hypervisor
> > should go through the queue first to handle and delete all
> > non-TLBI commands, and then forward the CMDQ to the host to
> > run remaining TLBI commands, if there's any.
> 
> Yes, there are a few different ways to handle this and still preserve
> batching. It is part of the reason it would be hard to make the kernel
> natively parse the commandq

Yea. I think the way I described above might be the cleanest,
since the host kernel would only handle all the leftover TLBI
commands? I am open to other better ideas, if there are any.

> On the other hand, we could add some more native kernel support for a
> SW emulated vCMDQ and that might be interesting for performance.

That's something I have thought about too. But it would feel
like changing the "hardware" of the VM, right? If the host
kernel enables nesting, then we'd have this extra queue for
TLBI commands. From the driver's perspective, it would feel
like detecting an extra feature bit in the HW register, but
there's no such bit in the SMMU HW spec :)

Yet, would you please elaborate how it impacts performance?
I can only see the benefit of isolation, from having a SW
emulated VCMDQ exclusively for TLBI commands v.s. having a
single CMDQ interlacing different commands, because both of
them require trapping and some sort of dispatching.

> One of the biggest reasons to use nesting is to get to vSVA and
> invalidation performance is very important in a vSVA environment. We
> should not ignore this in the design.
> 
> > > If the VMID is tied to the entire iommufd_ctx then it can flow
> > > independently.
> > 
> > One more thing about the VMID unification is that SMMU might
> > have limitation on the VMID range:
> > 	smmu->vmid_bits = reg & IDR0_VMID16 ? 16 : 8;
> > 	...
> > 	vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);
> > 
> > So, we'd likely need a CAP for that, to apply some limitation
> > with the iommufd_ctx too?
> 
> I'd imagine the driver would have to allocate its internal data
> against the iommufd_ctx
> 
> I'm not sure how best to organize that if it is the way to go.
> 
> Do we have a use case for more than one S2 iommu_domain on ARM?

In the previous VFIO solution from Eric, a nested iommu_domain
represented an S1+S2 two-stage setup. Since every CMD_CFGI_STE
could trigger an iommu_domain allocation like that, there could
be multiple S2 domains when we have 2+ passthrough devices.
That's why I had quite a few patches for VMID unification in the
old VCMDQ series.

But now, we have only one S2 domain that works well with multiple
devices. So, I can't really think of a use case that needs two
S2 domains. Yet, I am not very sure.

Btw, just to confirm my understanding, a use case having two
or more iommu_domains means an S2 iommu_domain replacement,
right? I.e. a running S2 iommu_domain gets replaced on the fly
by a different S2 iommu_domain holding a different VMID, while
the IOAS still has the previous mappings? When would that
actually happen in the real world?

Thanks
Nic
Jason Gunthorpe March 22, 2023, 5:28 p.m. UTC | #44
On Wed, Mar 22, 2023 at 10:11:33AM -0700, Nicolin Chen wrote:

> > Yes, there are a few different ways to handle this and still preserve
> > batching. It is part of the reason it would be hard to make the kernel
> > natively parse the commandq
> 
> Yea. I think the way I described above might be the cleanest,
> since the host kernel would only handle all the leftover TLBI
> commands? I am open to other better ideas, if there are any.

It seems best to have userspace take a first pass over the cmdq and
then send what it didn't handle to the kernel
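
Concretely, that VMM-side first pass could look something like this
(a minimal C sketch; vsmmu_emulate_cmd() and vsmmu_invalidate_ioctl()
are hypothetical helpers, and the opcode values are the ones from the
SMMUv3 spec):

#include <stdint.h>
#include <string.h>

struct vsmmu;
/* Hypothetical helpers, not a real QEMU/iommufd API: */
void vsmmu_emulate_cmd(struct vsmmu *vs, uint64_t *cmd);
void vsmmu_invalidate_ioctl(struct vsmmu *vs, uint64_t *cmds,
			    unsigned int n);

#define VSMMU_BATCH_MAX	64

/*
 * Walk the guest CMDQ from cons to prod: configuration commands
 * (e.g. CMD_CFGI_STE) are emulated in the VMM; invalidations are
 * batched and forwarded to the kernel in one call. Each SMMUv3
 * command is 16 bytes, with the opcode in bits [7:0] of dword 0.
 */
static void vsmmu_drain_cmdq(struct vsmmu *vs, uint64_t *q,
			     uint32_t cons, uint32_t prod,
			     uint32_t idx_mask)
{
	uint64_t batch[VSMMU_BATCH_MAX * 2];
	unsigned int n = 0;

	for (; cons != prod; cons++) {
		uint64_t *cmd = &q[(cons & idx_mask) * 2];

		switch (cmd[0] & 0xff) {
		case 0x11: /* CMD_TLBI_NH_ASID */
		case 0x12: /* CMD_TLBI_NH_VA */
		case 0x40: /* CMD_ATC_INV */
			memcpy(&batch[2 * n++], cmd, 16);
			if (n == VSMMU_BATCH_MAX) {
				vsmmu_invalidate_ioctl(vs, batch, n);
				n = 0;
			}
			break;
		default:
			/* flush first to keep ordering vs. config cmds */
			if (n) {
				vsmmu_invalidate_ioctl(vs, batch, n);
				n = 0;
			}
			vsmmu_emulate_cmd(vs, cmd);
		}
	}
	if (n)
		vsmmu_invalidate_ioctl(vs, batch, n);
}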

> > On the other hand, we could add some more native kernel support for a
> > SW emulated vCMDQ and that might be interesting for performance.
> 
> That's something I have thought about too. But it would feel
> like changing the "hardware" of the VM, right? If the host
> kernel enables nesting, then we'd have this extra queue for
> TLBI commands. From the driver's perspective, it would feel
> like detecting an extra feature bit in the HW register, but
> there's no such bit in the SMMU HW spec :)

You'd trigger it the same way vCMDQ triggers. It is basically SW
emulated vCMDQ.

> Yet, would you please elaborate how it impacts performance?
> I can only see the benefit of isolation, from having a SW
> emulated VCMDQ exclusively for TLBI commands v.s. having a
> single CMDQ interlacing different commands, because both of
> them require trapping and some sort of dispatching.

In theory we could make it work like virtio-iommu where the
doorbell ring for the SW emulated vCMDQ is delivered directly to a
kernel thread and chop a bunch of latency out of it.

The issue is latency to complete invalidation as in a vSVA scenario
the virtual process MM will block on IOMMU invalidation whenever it
does any mm_struct maintenance. Ie you slow a vast set of
operations. The less latency the better.

> Btw, just to confirm my understanding, a use case having two
> or more iommu_domains means an S2 iommu_domain replacement,
> right? I.e. a running S2 iommu_domain gets replaced on the fly
> by a different S2 iommu_domain holding a different VMID, while
> the IOAS still has the previous mappings? When would that
> actually happen in the real world?

It doesn't have to be a replacement - what is needed is that every vPCI
device connected to the same SMMU instance be using the same S2 and
thus the same VM_ID.

IOW every SID must be linked to the same VM_ID or invalidation commands
will not be properly processed.

qemu would have to have multiple SMMU instances according to S2
domains, which is probably true anyhow since we need to know what
physical SMMU instance to deliver the invalidation to anyhow.

Jason
Nicolin Chen March 22, 2023, 7:21 p.m. UTC | #45
On Wed, Mar 22, 2023 at 02:28:38PM -0300, Jason Gunthorpe wrote:
> On Wed, Mar 22, 2023 at 10:11:33AM -0700, Nicolin Chen wrote:
> 
> > > Yes, there are a few different ways to handle this and still preserve
> > > batching. It is part of the reason it would be hard to make the kernel
> > > natively parse the commandq
> > 
> > Yea. I think the way I described above might be the cleanest,
> > since the host kernel would only handle all the leftover TLBI
> > commands? I am open to other better ideas, if there are any.
> 
> It seems best to have userspace take a first pass over the cmdq and
> then send what it didn't handle to the kernel

Yes. I can go ahead with this approach for v2.

> > > On the other hand, we could add some more native kernel support for a
> > > SW emulated vCMDQ and that might be interesting for performance.
> > 
> > That's something I have thought about too. But it would feel
> > like changing the "hardware" of the VM, right? If the host
> > kernel enables nesting, then we'd have this extra queue for
> > TLBI commands. From the driver's perspective, it would feel
> > like detecting an extra feature bit in the HW register, but
> > there's no such bit in the SMMU HW spec :)
> 
> You'd trigger it the same way vCMDQ triggers. It is basically SW
> emulated vCMDQ.

It still feels like something very big. Off the top of my head,
we'd need a pair of new emulated registers for consumer and
producer indexes, and perhaps some configuration registers
too. How should we fit that into the MMIO space? Maybe we could
emulate that via ECMDQ? So, for QEMU, the SMMU device model
always has the ECMDQ feature so we can have this extra MMIO
space for a separate CMDQ.

> > Yet, would you please elaborate how it impacts performance?
> > I can only see the benefit of isolation, from having a SW
> > emulated VCMDQ exclusively for TLBI commands v.s. having a
> > single CMDQ interlacing different commands, because both of
> > them require trapping and some sort of dispatching.
> 
> In theory we could make it work like virtio-iommu where the
> doorbell ring for the SW emulated vCMDQ is delivered directly to a
> kernel thread and chop a bunch of latency out of it.

With a SW emulated VCMDQ, the dispatching is moved to the
guest kernel, v.s. the hypervisor. I still don't see a big
improvement here. Perhaps we should run a benchmark with
some experimental changes.

> The issue is latency to complete invalidation as in a vSVA scenario
> the virtual process MM will block on IOMMU invalidation whenever it
> does any mm_struct maintenance. Ie you slow a vast set of
> operations. The less latency the better.

Yea. If it has a noticeable perf gain, we should do that.

Do you prefer this to happen with this series? I would think
of adding this at a later stage, although I am not sure if
the uAPI would be completely compatible. It seems to me that
we would need a different uAPI, so as to set up a queue in an
earlier stage, and then to ring a bell when QEMU traps any
incoming commands in the emulated VCMDQ.

> > Btw, just to confirm my understanding, a use case having two
> > or more iommu_domains means an S2 iommu_domain replacement,
> > right? I.e. a running S2 iommu_domain gets replaced on the fly
> > by a different S2 iommu_domain holding a different VMID, while
> > the IOAS still has the previous mappings? When would that
> > actually happen in the real world?
> 
> > It doesn't have to be a replacement - what is needed is that every vPCI
> device connected to the same SMMU instance be using the same S2 and
> thus the same VM_ID.
> 
> > IOW every SID must be linked to the same VM_ID or invalidation commands
> will not be properly processed.
> 
> qemu would have to have multiple SMMU instances according to S2
> domains, which is probably true anyhow since we need to know what
> > physical SMMU instance to deliver the invalidation to anyhow.

I am not 100% following this part. So, you mean that we're
safe if we only have one SMMU instance, because there'd be
only one S2 domain, while multiple S2 domains would happen
if we have multiple SMMU instances?

Can we still use the same S2 domain for multiple instances?
Our approach of setting up a stage-2 mapping in QEMU is to
map the entire guest memory. I don't see a point in having
a separate S2 domain, even if there are multiple instances?

Btw, from a private discussion with Eric, he expressed the
difficulty of adding multiple SMMU instances in QEMU, as it
would complicate the device and ACPI components. For VCMDQ,
we do need a multi-instance environment, because there are
multiple physical pairs of SMMU+VCMDQ, i.e. multiple VCMDQ
MMIO regions being attached/used by different devices. So,
I have been exploring a different approach by creating an
internal multiplication inside VCMDQ...

Thanks
Nic
Jason Gunthorpe March 22, 2023, 7:41 p.m. UTC | #46
On Wed, Mar 22, 2023 at 12:21:27PM -0700, Nicolin Chen wrote:

> Do you prefer this to happen with this series? 

No, I just don't want to exclude doing it someday if people are
interested in optimizing this. As I said in the other thread I'd rather
optimize SMMUv3 emulation than try to use virtio-iommu to make it run
faster.

> the uAPI would be completely compatible. It seems to me that
> we would need a different uAPI, so as to set up a queue in an
> earlier stage, and then to ring a bell when QEMU traps any
> incoming commands in the emulated VCMDQ.

Yes, it would need more uAPI. Let's just make sure there is room and
maybe think a bit about what it would look like.

You should also draft through the HW vCMDQ stuff to ensure it fits
in here nicely.
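
For the record, here is one strawman of what that extra uAPI could
look like (nothing below exists; the struct and field names are made
up purely to sanity-check that there is room for it):

/* Strawman only: register a SW-emulated vCMDQ with the kernel */
struct iommu_hwpt_vcmdq_alloc {
	__u32 size;		/* sizeof(this struct) */
	__u32 flags;
	__u32 hwpt_id;		/* nesting parent the queue serves */
	__s32 event_fd;		/* signaled when the guest rings the bell */
	__aligned_u64 q_uptr;	/* base of the guest CMDQ in user memory */
	__aligned_u64 q_log2size; /* queue size, SMMUv3 log2 encoding */
};

The kernel side would then wake on event_fd, read the guest PROD
index, and consume commands much like the trap path does today.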

 
> > > Btw, just to confirm my understanding, a use case having two
> > > or more iommu_domains means an S2 iommu_domain replacement,
> > > right? I.e. a running S2 iommu_domain gets replaced on the fly
> > > by a different S2 iommu_domain holding a different VMID, while
> > > the IOAS still has the previous mappings? When would that
> > > actually happen in the real world?
> > 
> > It doesn't have to be a replacement - what is needed is that every vPCI
> > device connected to the same SMMU instance be using the same S2 and
> > thus the same VM_ID.
> > 
> > IOW every SID must be linked to the same VM_ID or invalidation commands
> > will not be properly processed.
> > 
> > qemu would have to have multiple SMMU instances according to S2
> > domains, which is probably true anyhow since we need to know what
> > physical SMMU instance to deliver the invalidation to anyhow.
> 
> I am not 100% following this part. So, you mean that we're
> safe if we only have one SMMU instance, because there'd be
> only one S2 domain, while multiple S2 domains would happen
> if we have multiple SMMU instances?

Yes, that would happen today, especially since each smmu has its own
vm_id allocator IIRC
 
> Can we still use the same S2 domain for multiple instances?

I think not today.

At the core, if we share the same S2 domain then it is a problem to
figure out what smmu instance to send the invalidation command to. EG
if the userspace invalidates ASID 1 you'd have to replicate
invalidation to all SMMU instances. Even if ASID 1 is used by only a
single SID/STE that has a single SMMU instance backing it.

So I think for ARM we want to reflect the physical SMMU instances into
vSMMU instances and that feels best done by having a unique S2
iommu_domain for each SMMU instance. Then we know that an invalidation
for a SMMU instance is delivered to that S2's singular CMDQ and things
like vCMDQ become possible.

> Our approach of setting up a stage-2 mapping in QEMU is to
> map the entire guest memory. I don't see a point in having
> a separate S2 domain, even if there are multiple instances?

And then this is the drawback, we don't really want to have duplicated
S2 page tables in the system for every stage 2.

Maybe we have made a mistake by allowing the S2 to be an unmanaged
domain. Perhaps we should create the S2 out of an unmanaged domain
like the S1.

Then the rules could be
 - Unmanaged domain can be used with every smmu instance, only one
   copy of the page table. The ASID in the iommu_domain is
   kernel-global
 - S2 domain is a child of a shared unmanaged domain. It can be used
   only with the SMMU it is associated with, it has a per-SMMU VM ID
 - S1 domain is a child of an S2 domain, it can be used only with the
   SMMU its S2 is associated with, just because
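
In (hypothetical) iommufd terms, the allocation flow those three
rules imply would be something like:

/*
 * Sketch only: one shared page table, one S2/S1 pair per SMMU
 * instance. alloc_hwpt()/alloc_hwpt_nested() stand in for whatever
 * the final uAPI ends up looking like.
 */
shared_hwpt = alloc_hwpt(iommufd, ioas_id);	/* unmanaged, one table */

for (i = 0; i < nr_smmu_instances; i++) {
	/* per-instance S2: walks shared_hwpt's table, own VMID */
	s2_hwpt[i] = alloc_hwpt_nested(iommufd, shared_hwpt, dev_id[i]);
	/* per-instance S1: child of that S2 */
	s1_hwpt[i] = alloc_hwpt_nested(iommufd, s2_hwpt[i], dev_id[i]);
}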

> Btw, from a private discussion with Eric, he expressed the
> difficulty of adding multiple SMMU instances in QEMU, as it
> would complicate the device and ACPI components. 

I'm not surprised by this, but for efficiency we probably have to do
this. Eric am I wrong?

qemu shouldn't have to do it immediately, but the kernel uAPI should
allow for a VMM that is optimized. We shouldn't exclude this by
mis-designing the kernel uAPI. qemu can replicate the invalidations
itself to make an inefficient single vSMMU.
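
i.e. something along these lines in the VMM, at the cost of one
ioctl per physical instance (the ioctl and field names here are
placeholders, not an existing uAPI):

/* Single vSMMU over N physical SMMUs: replicate each invalidation */
for (i = 0; i < nr_psmmu; i++) {
	inv.hwpt_id = s2_hwpt_id[i];	/* one S2 HWPT per physical SMMU */
	ioctl(iommufd, IOMMU_HWPT_INVALIDATE, &inv);
}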

> For VCMDQ, we do need a multi-instance environment, because there
> are multiple physical pairs of SMMU+VCMDQ, i.e. multiple VCMDQ MMIO
> regions being attached/used by different devices. 

Yes. IMHO vCMDQ is the sane design here - invalidation performance is
important, having a kernel-bypass way to do it is ideal. I understand
AMD has a similar kernel-bypass queue approach for their stuff too. I
think everyone will eventually need to do this, especially for CC
applications. Having the hypervisor able to interfere with
invalidation feels like an attack vector.

So we should focus on long term designs that allow kernel-bypass to
work, and I don't see a way to hide multi-instance and still truly
support vCMDQ??

> So, I have been exploring a different approach by creating an
> internal multiplication inside VCMDQ...

How can that work?

You'd have to have the guest VM know to replicate to different
vCMDQ's? Which isn't the standard SMMU programming model anymore..

Jason
Nicolin Chen March 22, 2023, 8:43 p.m. UTC | #47
On Wed, Mar 22, 2023 at 04:41:32PM -0300, Jason Gunthorpe wrote:
> On Wed, Mar 22, 2023 at 12:21:27PM -0700, Nicolin Chen wrote:
> 
> > Do you prefer this to happen with this series? 
> 
> No, I just don't want to exclude doing it someday if people are
> interested in optimizing this. As I said in the other thread I'd rather
> optimize SMMUv3 emulation than try to use virtio-iommu to make it run
> faster.

Got it. I will then just focus on reworking the invalidation
data structure with a list of command queue info.

> > the uAPI would be completely compatible. It seems to me that
> > we would need a different uAPI, so as to set up a queue in an
> > earlier stage, and then to ring a bell when QEMU traps any
> > incoming commands in the emulated VCMDQ.
> 
> Yes, it would need more uAPI. Let's just make sure there is room and
> maybe think a bit about what it would look like.
> 
> You should also draft through the HW vCMDQ stuff to ensure it fits
> in here nicely.

Yes.
  
> > > > Btw, just to confirm my understanding, a use case having two
> > > > or more iommu_domains means an S2 iommu_domain replacement,
> > > > right? I.e. a running S2 iommu_domain gets replaced on the fly
> > > > by a different S2 iommu_domain holding a different VMID, while
> > > > the IOAS still has the previous mappings? When would that
> > > > actually happen in the real world?
> > > 
> > > It doesn't have to be a replacement - what is needed is that every vPCI
> > > device connected to the same SMMU instance be using the same S2 and
> > > thus the same VM_ID.
> > > 
> > > IOW every SID must be linked to the same VM_ID or invalidation commands
> > > will not be properly processed.
> > > 
> > > qemu would have to have multiple SMMU instances according to S2
> > > domains, which is probably true anyhow since we need to know what
> > > physical SMMU instance to deliver the invalidation to anyhow.
> > 
> > I am not 100% following this part. So, you mean that we're
> > safe if we only have one SMMU instance, because there'd be
> > only one S2 domain, while multiple S2 domains would happen
> > if we have multiple SMMU instances?
> 
> Yes, that would happen today, especially since each smmu has its own
> vm_id allocator IIRC
>  
> > Can we still use the same S2 domain for multiple instances?
> 
> I think not today.
> 
> At the core, if we share the same S2 domain then it is a problem to
> figure out what smmu instance to send the invalidation command to. EG
> if the userspace invalidates ASID 1 you'd have to replicate
> invalidation to all SMMU instances. Even if ASID 1 is used by only a
> single SID/STE that has a single SMMU instance backing it.

Oh, right. That could be a perf drawback from unnecessary
IOTLB misses, because with a single instance QEMU has to
broadcast that invalidation to all SMMU instances.

> So I think for ARM we want to reflect the physical SMMU instances into
> vSMMU instances and that feels best done by having a unique S2
> iommu_domain for each SMMU instance. Then we know that an invalidation
> for a SMMU instance is delivered to that S2's singular CMDQ and things
> like vCMDQ become possible.

In that environment, do we still need a VMID unification?
 
> > Our approach of setting up a stage-2 mapping in QEMU is to
> > map the entire guest memory. I don't see a point in having
> > a separate S2 domain, even if there are multiple instances?
> 
> And then this is the drawback, we don't really want to have duplicated
> S2 page tables in the system for every stage 2.
> 
> Maybe we have made a mistake by allowing the S2 to be an unmanaged
> domain. Perhaps we should create the S2 out of an unmanaged domain
> like the S1.
> 
> Then the rules could be
>  - Unmanaged domain can be used with every smmu instance, only one
>    copy of the page table. The ASID in the iommu_domain is
>    kernel-global
>  - S2 domain is a child of a shared unmanaged domain. It can be used
>    only with the SMMU it is associated with, it has a per-SMMU VM ID
>  - S1 domain is a child of an S2 domain, it can be used only with the
>    SMMU its S2 is associated with, just because

The actual S2 pagetable has to stay at the unmanaged domain
for IOAS_MAP, while we maintain an s2_cfg data structure in
the shadow S2 domain per SMMU instance that has its own VMID
but a shared S2 page table pointer?

Hmm... Feels very complicated to me. How does that help?

> > Btw, from a private discussion with Eric, he expressed the
> > difficulty of adding multiple SMMU instances in QEMU, as it
> > would complicate the device and ACPI components. 
> 
> I'm not surprised by this, but for efficiency we probably have to do
> this. Eric am I wrong?
> 
> qemu shouldn't have to do it immediately, but the kernel uAPI should
> allow for a VMM that is optimized. We shouldn't exclude this by
> mis-designing the kernel uAPI. qemu can replicate the invalidations
> itself to make an inefficient single vSMMU.
> 
> > For VCMDQ, we do need a multi-instance environment, because there
> > are multiple physical pairs of SMMU+VCMDQ, i.e. multiple VCMDQ MMIO
> > regions being attached/used by different devices. 
> 
> Yes. IMHO vCMDQ is the sane design here - invalidation performance is
> important, having a kernel-bypass way to do it is ideal. I understand
> AMD has a similar kernel-bypass queue approach for their stuff too. I
> think everyone will eventually need to do this, especially for CC
> applications. Having the hypervisor able to interfere with
> invalidation feels like an attack vector.
> 
> So we should focus on long term designs that allow kernel-bypass to
> > work, and I don't see a way to hide multi-instance and still truly
> support vCMDQ??

Well, I agree and hope people across the board decide to move
towards the multi-instance direction.

> > So, I have been exploring a different approach by creating an
> > internal multiplication inside VCMDQ...
> 
> How can that work?
> 
> You'd have to have the guest VM know to replicate to different
> vCMDQ's? Which isn't the standard SMMU programming model anymore..

VCMDQ has multiple VINTFs (Virtual Interfaces) that are supposed
to be used by the host to expose to multiple VMs.

In a multi-SMMU environment, every single SMMU+VCMDQ instance
would have one VINTF only that contains one or more VCMDQs. In
this case, passthrough devices behind different physical SMMU
instances are straightforwardly attached to different vSMMUs.

However, if we can't have multiple vSMMU instances, the guest
VM (its HW) would enable multiple VINTFs corresponding to the
number of physical SMMU/VCMDQ instances, for devices to attach
accordingly. That means I need to figure out a way to pin the
devices onto those VINTFs, by somehow passing their physical
SMMU IDs. The latest progress that I made is to have a bit of
a hack in the DSDT table by inserting a physical SMMU ID into
every single passthrough device node, though I still need to
confirm the legality of doing that...

Thanks
Nic
Nicolin Chen March 22, 2023, 8:57 p.m. UTC | #48
On Mon, Mar 20, 2023 at 07:19:34PM -0300, Jason Gunthorpe wrote:

> > IIUIC, the ioctl for the link of vSID/dev_id should happen at
> > the stage when the guest boots, while the HWPT alloc ioctl happens
> > at CFGI_STE.
> 
> Yes
>  
> > > > What could be a good prototype of the ioctl? Would it be a VFIO
> > > > device one or IOMMUFD one?
> > > 
> > > If we load the vSID table it should be a iommufd one, linked to the
> > > ARM SMMUv3 driver and probably take in a pointer to an array of
> > > vSID/dev_id pairs. Maybe an add/remove type of operation.
> > 
> > Will try some solution.
> 
> It is only necessary if you want to do batching
> 
> For non-batching the SID invalidation should be done differently with
> a device_id input instead. That is a bit tricky to organize as you
> want iommufd to get back a 'struct device *' from the ID.

I am wondering whether we need to have dev_id, i.e. IOMMUFD,
in play with the link of pSID<->vSID, as I am thinking of a
simplified approach by passing the vSID via the hwpt alloc
structure when we allocate an S2 domain.

The arm_smmu_domain_alloc_user() takes this vSID and a dev
pointer, so it can easily tie the vSID to the dev's pSID.

By doing so, we wouldn't need a new ioctl anymore.

Thanks
Nic
Jason Gunthorpe March 23, 2023, 12:16 p.m. UTC | #49
On Wed, Mar 22, 2023 at 01:43:59PM -0700, Nicolin Chen wrote:

> > So I think for ARM we want to reflect the physical SMMU instances into
> > vSMMU instances and that feels best done by having a unique S2
> > iommu_domain for each SMMU instance. Then we know that an invalidation
> > for a SMMU instance is delivered to that S2's singular CMDQ and things
> > like vCMDQ become possible.
> 
> In that environment, do we still need a VMID unification?

If each S2 is per-smmu-instance then the VMID can be local to the SMMU
instance

> > > Our approach of setting up a stage-2 mapping in QEMU is to
> > > map the entire guest memory. I don't see a point in having
> > > a separate S2 domain, even if there are multiple instances?
> > 
> > And then this is the drawback, we don't really want to have duplicated
> > S2 page tables in the system for every stage 2.
> > 
> > Maybe we have made a mistake by allowing the S2 to be an unmanaged
> > domain. Perhaps we should create the S2 out of an unmanaged domain
> > like the S1.
> > 
> > Then the rules could be
> >  - Unmanaged domain can be used with every smmu instance, only one
> >    copy of the page table. The ASID in the iommu_domain is
> >    kernel-global
> >  - S2 domain is a child of a shared unmanaged domain. It can be used
> >    only with the SMMU it is associated with, it has a per-SMMU VM ID
> >  - S1 domain is a child of an S2 domain, it can be used only with the
> >    SMMU its S2 is associated with, just because
> 
> The actual S2 pagetable has to stay at the unmanaged domain
> for IOAS_MAP, while we maintain an s2_cfg data structure in
> the shadow S2 domain per SMMU instance that has its own VMID
> but a shared S2 page table pointer?

Yes

> Hmm... Feels very complicated to me. How does that help?

It de-duplicates the page table across multiple SMMU instances.
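
In data-structure terms the split would be roughly the following (a
hypothetical layout; today's driver keeps the vmid in a single
arm_smmu_s2_cfg inside the domain):

/* One shared S2 page table, N per-instance configs */
struct arm_smmu_shared_s2 {
	struct io_pgtable_ops	*pgtbl_ops;	/* the single table copy */
};

struct arm_smmu_instance_s2 {
	struct arm_smmu_shared_s2 *shared;	/* page table to walk */
	struct arm_smmu_device	*smmu;		/* owning instance */
	u16			vmid;		/* from smmu->vmid_map */
};

Attach and invalidation would then always go through the per-instance
wrapper, so commands land on that instance's CMDQ with the right VMID.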

> > > So, I have been exploring a different approach by creating an
> > > internal multiplication inside VCMDQ...
> > 
> > How can that work?
> > 
> > You'd have to have the guest VM know to replicate to different
> > vCMDQ's? Which isn't the standard SMMU programming model anymore..
> 
> VCMDQ has multiple VINTFs (Virtual Interfaces) that are supposed
> to be used by the host to expose to multiple VMs.
> 
> In a multi-SMMU environment, every single SMMU+VCMDQ instance
> would have one VINTF only that contains one or more VCMDQs. In
> this case, passthrough devices behind different physical SMMU
> instances are straightforwardly attached to different vSMMUs.

Yes, this is the obvious simple implementation

> However, if we can't have multiple vSMMU instances, the guest
> VM (its HW) would enable multiple VINTFs corresponding to the
> number of physical SMMU/VCMDQ instances, for devices to attach
> accordingly. That means I need to figure out a way to pin the
> devices onto those VINTFs, by somehow passing their physical
> SMMU IDs. 

And a way to request the correctly bound vCMDQ from the guest as well.
Sounds really messy, I'd think multi-smmu is the much cleaner choice

Jason
Jason Gunthorpe March 23, 2023, 12:17 p.m. UTC | #50
On Wed, Mar 22, 2023 at 01:57:23PM -0700, Nicolin Chen wrote:
> On Mon, Mar 20, 2023 at 07:19:34PM -0300, Jason Gunthorpe wrote:
> 
> > > IIUIC, the ioctl for the link of vSID/dev_id should happen at
> > > the stage when the guest boots, while the HWPT alloc ioctl happens
> > > at CFGI_STE.
> > 
> > Yes
> >  
> > > > > What could be a good prototype of the ioctl? Would it be a VFIO
> > > > > device one or IOMMUFD one?
> > > > 
> > > > If we load the vSID table it should be a iommufd one, linked to the
> > > > ARM SMMUv3 driver and probably take in a pointer to an array of
> > > > vSID/dev_id pairs. Maybe an add/remove type of operation.
> > > 
> > > Will try some solution.
> > 
> > It is only necessary if you want to do batching
> > 
> > For non-batching the SID invalidation should be done differently with
> > a device_id input instead. That is a bit tricky to organize as you
> > want iommufd to get back a 'struct device *' from the ID.
> 
> I am wondering whether we need to have dev_id, i.e. IOMMUFD,
> in play with the link of pSID<->vSID, as I am thinking of a
> simplified approach by passing the vSID via the hwpt alloc
> structure when we allocate an S2 domain.

No, that doesn't make sense. The vSID is per-STE, the S2 domain is
fully shared. You can't put SID information in the iommu_domains.

Jason
Nicolin Chen March 23, 2023, 6:13 p.m. UTC | #51
On Thu, Mar 23, 2023 at 09:16:51AM -0300, Jason Gunthorpe wrote:
> On Wed, Mar 22, 2023 at 01:43:59PM -0700, Nicolin Chen wrote:
> 
> > > So I think for ARM we want to reflect the physical SMMU instances into
> > > vSMMU instances and that feels best done by having a unique S2
> > > iommu_domain for each SMMU instance. Then we know that an invalidation
> > > for a SMMU instance is delivered to that S2's singular CMDQ and things
> > > like vCMDQ become possible.
> > 
> > In that environment, do we still need a VMID unification?
> 
> If each S2 is per-smmu-instance then the VMID can be local to the SMMU
> instance

It sounds like that's related to the multi-SMMU instance too? Anyway,
it's good to know that we have a way out from requiring this
VMID unification.

> > > > Our approach of setting up a stage-2 mapping in QEMU is to
> > > > map the entire guest memory. I don't see a point in having
> > > > a separate S2 domain, even if there are multiple instances?
> > > 
> > > And then this is the drawback, we don't really want to have duplicated
> > > S2 page tables in the system for every stage 2.
> > > 
> > > Maybe we have made a mistake by allowing the S2 to be an unmanaged
> > > domain. Perhaps we should create the S2 out of an unmanaged domain
> > > like the S1.
> > > 
> > > Then the rules could be
> > >  - Unmanaged domain can be used with every smmu instance, only one
> > >    copy of the page table. The ASID in the iommu_domain is
> > >    kernel-global
> > >  - S2 domain is a child of a shared unmanaged domain. It can be used
> > >    only with the SMMU it is associated with, it has a per-SMMU VM ID
> > >  - S1 domain is a child of an S2 domain, it can be used only with the
> > >    SMMU its S2 is associated with, just because
> > 
> > The actual S2 pagetable has to stay at the unmanaged domain
> > for IOAS_MAP, while we maintain an s2_cfg data structure in
> > the shadow S2 domain per SMMU instance that has its own VMID
> > but a shared S2 page table pointer?
> 
> Yes
> 
> > Hmm... Feels very complicated to me. How does that help?
> 
> It de-duplicates the page table across multiple SMMU instances.

Oh. So that the s2_cfg data structures can have a shared S2
IOPT while having different VMIDs. This would be a big rework.
It changes the two-domain design for nesting. Should we do
this at a later stage when supporting multi-SMMU instance or
now? And I am not sure Intel would need this...

> > > > So, I have been exploring a different approach by creating an
> > > > internal multiplication inside VCMDQ...
> > > 
> > > How can that work?
> > > 
> > > You'd have to have the guest VM know to replicate to different
> > > vCMDQ's? Which isn't the standard SMMU programming model anymore..
> > 
> > VCMDQ has multiple VINTFs (Virtual Interfaces) that are supposed
> > to be used by the host to expose to multiple VMs.
> > 
> > In a multi-SMMU environment, every single SMMU+VCMDQ instance
> > would have one VINTF only that contains one or more VCMDQs. In
> > this case, passthrough devices behind different physical SMMU
> > instances are straightforwardly attached to different vSMMUs.
> 
> > Yes, this is the obvious simple implementation
> 
> > However, if we can't have multiple vSMMU instances, the guest
> > VM (its HW) would enable multiple VINTFs corresponding to the
> > number of physical SMMU/VCMDQ instances, for devices to attach
> > accordingly. That means I need to figure out a way to pin the
> > devices onto those VINTFs, by somehow passing their physical
> > SMMU IDs. 
> 
> And a way to request the correctly bound vCMDQ from the guest as well.
> > Sounds really messy, I'd think multi-smmu is the much cleaner choice

Yes. I agree, we would need the entire QEMU community to give
consent to change that though.

Thanks!
Nicolin
Jason Gunthorpe March 23, 2023, 6:27 p.m. UTC | #52
On Thu, Mar 23, 2023 at 11:13:48AM -0700, Nicolin Chen wrote:
> Oh. So that the s2_cfg data structures can have a shared S2
> IOPT while having different VMIDs. This would be a big rework.
> It changes the two-domain design for nesting. Should we do
> this at a later stage when supporting multi-SMMU instance or
> now? And I am not sure Intel would need this...

If we do nothing right now then the S2 unmanaged iommu_domain will
carry the vm_id and it will be locked to a single SMMU instance.

To support multi-instance HW qemu would have to duplicate the entire
S2 unmanaged domain to get different vm_ids.

This is basically status-quo today because SMMU already doesn't
support sharing the unmanaged iommu_domain between instances.

If we chart a path to using a dedicated S2 domain then the qemu side would
have to change to make a normal HWPT to back the S2 and then create a
real S2 as a child.

This implies that the request for S2 has to be in the driver data
today so that the driver knows if it should enable the unmanaged
domain for S2 operation and lock it to an instance.

So long as that is OK we are probably OK to be incremental..

> > And a way to request the correctly bound vCMDQ from the guest as well.
> > Sounds really messsy, I'd think multi-smmu is the much cleaner choice
> 
> Yes. I agree, we would need the entire QEMU community to give
> consent to change that though.

I suppose it wasn't consent, it was that someone needs to do the
difficult work.

Jason
Tian, Kevin March 24, 2023, 8:47 a.m. UTC | #53
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, March 21, 2023 7:49 PM
> 
> On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:
> 
> > > > Rephrasing that to put into a design: the IOCTL would pass a
> > > > user pointer to the queue, the size of the queue, then a head
> > > > pointer and a tail pointer? Then the kernel reads out all the
> > > > commands between the head and the tail and handles all those
> > > > invalidation commands only?
> > >
> > > Yes, that is one possible design
> >
> > If we cannot have the short path in the kernel then I'm not sure the
> > value of using native format and queue in the uAPI. Batching can
> > be enabled over any format.
> 
> SMMUv3 will have a hardware short path where the HW itself runs the
> VM's command queue and does this logic.
> 
> So I like the symmetry of the SW path being close to that.
> 

Out of curiosity. VCMDQ is per SMMU. Does it imply that Qemu needs
to create multiple vSMMU instances if devices assigned to it are behind
different physical SMMUs (plus one instance specific for emulated
devices), to match VCMDQ with a specific device?

btw is VCMDQ in the standard SMMU spec or an NVIDIA-specific extension?
If the latter does it require extra changes in guest smmu driver?

The symmetry of the SW path has another merit beyond performance.
It allows live migration to fall back to the sw short-path with not-so-bad
overhead when the dest machine cannot afford the same number of
VCMDQ's as the src.

But still the main open question for the in-kernel short-path is what
would be the framework to move part of vIOMMU emulation into the
kernel. If this can be done cleanly then it's better than vhost-iommu,
which lags behind significantly regarding advanced features. But if it
cannot be done cleanly, leaving each vendor to move random emulation
logic into the kernel, then vhost-iommu sounds more friendly to the
kernel, though lots of work remains to fill the feature gap.
Tian, Kevin March 24, 2023, 8:55 a.m. UTC | #54
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, March 22, 2023 1:15 PM
> 
> >
> > Something has to generate CMD_ATC_INV.
> >
> > How do you plan to generate this from the hypervisor based on ASID
> > invalidations?
> >
> > The hypervisor doesn't know what ASIDs are connected to what SIDs to
> > generate the ATC?
> >
> > Intel is different, they know what devices the vDID is connected to,
> > so when they get a vDID invalidation they can elaborate it into a ATC
> > invalidation. ARM doesn't have that information.
> 
> I see. Perhaps vSMMU still needs to forward CMD_ATC_INV. And,

Ah that's quite a different story. 
Tian, Kevin March 24, 2023, 9:02 a.m. UTC | #55
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Wednesday, March 22, 2023 2:42 PM
> 
> On Tue, Mar 21, 2023 at 08:48:31AM -0300, Jason Gunthorpe wrote:
> > On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:
> >
> > > > > Rephrasing that to put into a design: the IOCTL would pass a
> > > > > user pointer to the queue, the size of the queue, then a head
> > > > > pointer and a tail pointer? Then the kernel reads out all the
> > > > > commands between the head and the tail and handles all those
> > > > > invalidation commands only?
> > > >
> > > > Yes, that is one possible design
> > >
> > > If we cannot have the short path in the kernel then I'm not sure the
> > > value of using native format and queue in the uAPI. Batching can
> > > be enabled over any format.
> >
> > SMMUv3 will have a hardware short path where the HW itself runs the
> > VM's command queue and does this logic.
> >
> > So I like the symmetry of the SW path being close to that.
> 
> A tricky thing here that I just realized:
> 
> With VCMDQ, the guest will have two CMDQs. One is the vSMMU's
> CMDQ handling all non-TLBI commands like CMD_CFGI_STE via the
> invalidation IOCTL, and the other hardware accelerated VCMDQ
> handling all TLBI commands by the HW. In this setup, we will
> need a VCMDQ kernel driver to dispatch commands into the two
> different queues.
> 

why doesn't hw generate a vm-exit for unsupported CMDs in VCMDQ
and then let them be emulated by vSMMU? Such events should be rare
once map/unmap are being conducted...
Jason Gunthorpe March 24, 2023, 2:44 p.m. UTC | #56
On Fri, Mar 24, 2023 at 08:47:20AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, March 21, 2023 7:49 PM
> > 
> > On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:
> > 
> > > > > Rephrasing that to put into a design: the IOCTL would pass a
> > > > > user pointer to the queue, the size of the queue, then a head
> > > > > pointer and a tail pointer? Then the kernel reads out all the
> > > > > commands between the head and the tail and handles all those
> > > > > invalidation commands only?
> > > >
> > > > Yes, that is one possible design
> > >
> > > If we cannot have the short path in the kernel then I'm not sure the
> > > value of using native format and queue in the uAPI. Batching can
> > > be enabled over any format.
> > 
> > SMMUv3 will have a hardware short path where the HW itself runs the
> > VM's command queue and does this logic.
> > 
> > So I like the symmetry of the SW path being close to that.
> > 
> 
> Out of curiosity. VCMDQ is per SMMU. Does it imply that Qemu needs
> to create multiple vSMMU instances if devices assigned to it are behind
> different physical SMMUs (plus one instance specific for emulated
> devices), to match VCMDQ with a specific device?

Yes

> btw is VCMDQ in the standard SMMU spec or an NVIDIA-specific extension?
> If the latter does it require extra changes in guest smmu driver?

It is a mash up of ARM standard ECMDQ with a few additions. I hope ARM
will standardize something someday

> The symmetry of the SW path has another merit beyond performance.
> It allows live migration to fall back to the sw short-path with not-so-bad
> overhead when the dest machine cannot afford the same number of
> VCMDQ's as the src.

Well, that requires SW emulation of the VCMDQ thing, but yes
 
> But still the main open question for the in-kernel short-path is what
> would be the framework to move part of vIOMMU emulation into the
> kernel. If this can be done cleanly then it's better than vhost-iommu,
> which lags behind significantly regarding advanced features. But if it
> cannot be done cleanly, leaving each vendor to move random emulation
> logic into the kernel, then vhost-iommu sounds more friendly to the
> kernel, though lots of work remains to fill the feature gap.

I assume there are reasonable ways to hook the kernel to kvm, vhost
does it. I've never looked at it. At worst we need to factor some of
the vhost code into some library to allow it.

We want a kernel thread to wake up on a doorbell ring basically.

Jason
Jason Gunthorpe March 24, 2023, 2:57 p.m. UTC | #57
On Fri, Mar 24, 2023 at 09:02:34AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Wednesday, March 22, 2023 2:42 PM
> > 
> > On Tue, Mar 21, 2023 at 08:48:31AM -0300, Jason Gunthorpe wrote:
> > > On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:
> > >
> > > > > > Rephrasing that to put into a design: the IOCTL would pass a
> > > > > > user pointer to the queue, the size of the queue, then a head
> > > > > > pointer and a tail pointer? Then the kernel reads out all the
> > > > > > commands between the head and the tail and handles all those
> > > > > > invalidation commands only?
> > > > >
> > > > > Yes, that is one possible design
> > > >
> > > > If we cannot have the short path in the kernel then I'm not sure the
> > > > value of using native format and queue in the uAPI. Batching can
> > > > be enabled over any format.
> > >
> > > SMMUv3 will have a hardware short path where the HW itself runs the
> > > VM's command queue and does this logic.
> > >
> > > So I like the symmetry of the SW path being close to that.
> > 
> > A tricky thing here that I just realized:
> > 
> > With VCMDQ, the guest will have two CMDQs. One is the vSMMU's
> > CMDQ handling all non-TLBI commands like CMD_CFGI_STE via the
> > invalidation IOCTL, and the other hardware accelerated VCMDQ
> > handling all TLBI commands by the HW. In this setup, we will
> > need a VCMDQ kernel driver to dispatch commands into the two
> > different queues.
> > 
> 
> why doesn't hw generate a vm-exit for unsupported CMDs in VCMDQ
> and then let them be emulated by vSMMU? Such events should be rare
> once map/unmap are being conducted...

IIRC vcmdq is defined to only process invalidations, so it would be a
driver error to send anything else. I think this is what Nicolin
means.  Most likely to use it the VM would have to see the nvidia acpi
extension and activate vcmdq in the VM.

If you suggest to overlay the main cmdq with the vcmdq and then don't
tell the guest about it.. Robin suggested something similar.

This idea would be a half and half, the HW would run the queue and the
doorbell and generate error interrupts back to the hypervisor and tell
it that the queue is paused and ask it to fix the failed entry and
restart.

I could see this as an interesting solution, but I don't know if this
HW can support it..
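
FWIW the recovery flow would resemble what arm_smmu_cmdq_skip_err()
already does for the host queue, with an emulation step added. A
rough sketch, assuming the HW pauses the queue on the bad entry; the
register and helper names are borrowed loosely from the host driver
and are not the real VCMDQ ones:

/* IRQ path when the HW-run guest queue stops on CMD_ILLEGAL */
static void vcmdq_handle_illegal(struct vcmdq *q)
{
	u32 cons = readl_relaxed(q->regs + VCMDQ_CONS);	/* reg assumed */
	u64 cmd[CMDQ_ENT_DWORDS];

	/* Fetch the command the HW stopped on from the guest queue */
	queue_read(cmd, q->guest_q + Q_IDX(q, cons) * CMDQ_ENT_DWORDS,
		   CMDQ_ENT_DWORDS);
	vsmmu_emulate_cmd(q->vsmmu, cmd);	/* e.g. CMD_CFGI_STE */

	/*
	 * Overwrite the entry with CMD_SYNC so the HW can re-consume
	 * it and move on, as the host error path does.
	 */
	arm_smmu_cmdq_build_cmd(cmd, &(struct arm_smmu_cmdq_ent){
					.opcode = CMDQ_OP_CMD_SYNC, });
	queue_write(q->guest_q + Q_IDX(q, cons) * CMDQ_ENT_DWORDS, cmd,
		    CMDQ_ENT_DWORDS);
	vcmdq_resume(q);			/* assumed HW control */
}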

Jason
Nicolin Chen March 24, 2023, 5:35 p.m. UTC | #58
On Fri, Mar 24, 2023 at 11:57:09AM -0300, Jason Gunthorpe wrote:
> On Fri, Mar 24, 2023 at 09:02:34AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Wednesday, March 22, 2023 2:42 PM
> > > 
> > > On Tue, Mar 21, 2023 at 08:48:31AM -0300, Jason Gunthorpe wrote:
> > > > On Tue, Mar 21, 2023 at 08:34:00AM +0000, Tian, Kevin wrote:
> > > >
> > > > > > > Rephrasing that to put into a design: the IOCTL would pass a
> > > > > > > user pointer to the queue, the size of the queue, then a head
> > > > > > > pointer and a tail pointer? Then the kernel reads out all the
> > > > > > > commands between the head and the tail and handles all those
> > > > > > > invalidation commands only?
> > > > > >
> > > > > > Yes, that is one possible design
> > > > >
> > > > > If we cannot have the short path in the kernel then I'm not sure the
> > > > > value of using native format and queue in the uAPI. Batching can
> > > > > be enabled over any format.
> > > >
> > > > SMMUv3 will have a hardware short path where the HW itself runs the
> > > > VM's command queue and does this logic.
> > > >
> > > > So I like the symmetry of the SW path being close to that.
> > > 
> > > A tricky thing here that I just realized:
> > > 
> > > With VCMDQ, the guest will have two CMDQs. One is the vSMMU's
> > > CMDQ handling all non-TLBI commands like CMD_CFGI_STE via the
> > > invalidation IOCTL, and the other hardware accelerated VCMDQ
> > > handling all TLBI commands by the HW. In this setup, we will
> > > need a VCMDQ kernel driver to dispatch commands into the two
> > > different queues.
> > > 
> > 
> > why doesn't hw generate a vm-exit for unsupported CMDs in VCMDQ
> > and then let them be emulated by vSMMU? Such events should be rare
> > once map/unmap are being conducted...
> 
> IIRC vcmdq is defined to only process invalidations, so it would be a
> driver error to send anything else. I think this is what Nicolin
> means.  Most likely to use it the VM would have to see the nvidia acpi
> extension and activate vcmdq in the VM.
> 
> If you suggest to overlay the main cmdq with the vcmdq and then don't
> tell the guest about it.. Robin suggested something similar.

Yea, I remember that too, from the email that I received from
Robin on Christmas Eve :)

Yet, I haven't got a chance to run some experiment with that.

> This idea would be a half and half, the HW would run the queue and the
> doorbell and generate error interrupts back to the hypervisor and tell
> it that the queue is paused and ask it to fix the failed entry and
> restart.
>
> I could see this as an interesting solution, but I don't know if this
> HW can support it..

It possibly can, since an unsupported command will trigger an
Illegal Command interrupt, then the IRQ handler could read it
out of the CMDQ. Again, I'd need to run some experiment, once
this SMMU nesting series is settled down to certain level.

One immediate thing about this solution is that we still need
a multi-CMDQ support per SMMU instance, besides from a multi-
SMMU instance support. This might be implemented as the ECMDQ
I guess. But I am not sure if there is any ECMDQ HW available,
so that we can add its support first, to fit VCMDQ into it.

Overall, interesting topics! I'd like to carry on along the
way of this series, hoping we can figure out something smart
and solid to implement :)

Thanks
Nicolin
Tian, Kevin March 28, 2023, 2:48 a.m. UTC | #59
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Friday, March 24, 2023 10:45 PM
> 
> > But still the main open question for the in-kernel short-path is what
> > would be the framework to move part of vIOMMU emulation into the
> > kernel. If this can be done cleanly then it's better than vhost-iommu,
> > which lags behind significantly regarding advanced features. But if it
> > cannot be done cleanly, leaving each vendor to move random emulation
> > logic into the kernel, then vhost-iommu sounds more friendly to the
> > kernel, though lots of work remains to fill the feature gap.
> 
> I assume there are reasonable ways to hook the kernel to kvm, vhost
> does it. I've never looked at it. At worst we need to factor some of
> the vhost code into some library to allow it.
> 
> We want a kernel thread to wake up on a doorbell ring basically.
> 

kvm supports ioeventfd for the doorbell purpose.

Aside from that I'm not sure which part of vhost can be generalized
to be used by other vIOMMUs. It's an in-memory ring structure plus
doorbell so it's easy to fit in the kernel.

But emulated vIOMMUs are typically MMIO-based ring structures
which requires 1) kvm provides a synchronous ioeventfd for MMIO
based head/tail emulation; 2) userspace vIOMMU shares its virtual
register page with the kernel which can then update virtual tail/head
registers w/o exiting to the userspace; 3) the kernel thread can
selectively exit to userspace for cmds which it cannot directly handle.

Those require a new framework to establish.
Tian, Kevin March 28, 2023, 3:03 a.m. UTC | #60
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, March 25, 2023 1:35 AM
> 
> On Fri, Mar 24, 2023 at 11:57:09AM -0300, Jason Gunthorpe wrote:
> >
> > If you suggest to overlay the main cmdq with the vcmdq and then don't
> > tell the guest about it.. Robin suggested something similar.

yes, that's my point.

> 
> Yea, I remember that too, from the email that I received from
> Robin on Christmas Eve :)
> 
> Yet, I haven't got a chance to run some experiment with that.
> 
> > This idea would be a half and half, the HW would run the queue and the
> > doorbell and generate error interrupts back to the hypervisor and tell
> > it that the queue is paused and ask it to fix the failed entry and
> > restart.
> >
> > I could see this as an interesting solution, but I don't know if this
> > HW can support it..
> 
> It possibly can, since an unsupported command will trigger an
> Illegal Command interrupt, then the IRQ handler could read it
> out of the CMDQ. Again, I'd need to run some experiment, once
> this SMMU nesting series is settled down to certain level.
> 

also you want to ensure that the error is of a recoverable type so
that once sw fixes it the hw can continue to behave correctly.
Jason Gunthorpe March 28, 2023, 12:26 p.m. UTC | #61
On Tue, Mar 28, 2023 at 02:48:31AM +0000, Tian, Kevin wrote:

> But emulated vIOMMUs are typically MMIO-based ring structures
> which requires 1) kvm provides a synchronous ioeventfd for MMIO
> based head/tail emulation; 2) userspace vIOMMU shares its virtual
> register page with the kernel which can then update virtual tail/head
> registers w/o exiting to the userspace; 3) the kernel thread can
> selectively exit to userspace for cmds which it cannot directly handle.

What is needed is for the kvm side to capture the store, execute it
to some backing memory, and also trigger the eventfd.

It shouldn't need to be synchronous.

For SMMU the interface is laid out with unique 4k pages per-CMDQ that
contain the 3 relevant 8-byte values.

So we could mmap a page from the kernel that has the 3 values. qemu
would install the page in the kvm memory map and it would 
arrange things so that stores reach the 8 bytes and trigger an
eventfd.

Kernel simply reads the cons index after the eventfd, looks in the
IOAS to get the queue memory and does the operation async.

It is not especially conceptually difficult..
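
The doorbell half of this already exists as KVM_IOEVENTFD; a minimal
userspace sketch of arming it on the emulated PROD register (the GPA
and the page arrangement are the assumptions above):

#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>

/* Fire an eventfd whenever the guest stores to the emulated PROD reg */
static int vcmdq_arm_doorbell(int vm_fd, __u64 prod_gpa)
{
	int efd = eventfd(0, EFD_CLOEXEC);
	struct kvm_ioeventfd ie = {
		.addr	= prod_gpa,	/* GPA of the 8-byte prod register */
		.len	= 8,
		.fd	= efd,
		.flags	= 0,		/* no datamatch: any store fires */
	};

	if (efd < 0 || ioctl(vm_fd, KVM_IOEVENTFD, &ie))
		return -1;
	return efd;	/* handed to whichever thread drains the queue */
}

One caveat: a plain ioeventfd discards the written value instead of
storing it to backing memory, so the "capture the store" part would
still need either a small kvm extension or for the index to be
recoverable from the queue page itself.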

Jason
Tian, Kevin March 31, 2023, 8:09 a.m. UTC | #62
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, March 28, 2023 8:27 PM
> 
> On Tue, Mar 28, 2023 at 02:48:31AM +0000, Tian, Kevin wrote:
> 
> > But emulated vIOMMUs are typically MMIO-based ring structures
> > which requires 1) kvm provides a synchronous ioeventfd for MMIO
> > based head/tail emulation; 2) userspace vIOMMU shares its virtual
> > register page with the kernel which can then update virtual tail/head
> > registers w/o exiting to the userspace; 3) the kernel thread can
> > selectively exit to userspace for cmds which it cannot directly handle.
> 
> What is needed is for the kvm side to capture the store, execute it
> to some backing memory, and also trigger the eventfd.
> 
> It shouldn't need to be synchronous.

Correct

> 
> For SMMU the interface is laid out with unique 4k pages per-CMDQ that
> contain the 3 relevant 8-byte values.

VT-d has only one invalidation queue with relevant registers mixed
with other VT-d registers in a 4k page. But this should be fine as long
as the new mechanism allows specifying which offsets in the mapped
page fall into the fast path.

> 
> So we could mmap a page from the kernel that has the 3 values. qemu
> would install the page in the kvm memory map and it would
> arrange things so that stores reach the 8 bytes and trigger an
> eventfd.
> 
> Kernel simply reads the cons index after the eventfd, looks in the
> IOAS to get the queue memory and does the operation async.
> 
> It is not especially conceptually difficult..
> 

Looks so, at least in concept.

btw regarding the initial nesting support on smmu do you want
to follow this unique 4k layout plus native cmdq format or just
the latter (i.e. cmd format is native but head/tail/start is defined
in a sw customized way)?

If the latter I wonder whether it's necessary to generalize it so
the batching format is vendor-agnostic while the specific cmd/
descriptor format is vendor specific.

Thanks
Kevin
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index ac63185ae268..7d73eab5e7f4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2880,9 +2880,65 @@  static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
 	arm_smmu_sva_remove_dev_pasid(domain, dev, pasid);
 }
 
+static void arm_smmu_cache_invalidate_user(struct iommu_domain *domain,
+					   void *user_data)
+{
+	struct iommu_hwpt_invalidate_arm_smmuv3 *inv_info = user_data;
+	struct arm_smmu_cmdq_ent cmd = { .opcode = inv_info->opcode };
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	size_t granule_size = inv_info->granule_size;
+	unsigned long iova = 0;
+	size_t size = 0;
+	int ssid = 0;
+
+	if (!smmu || !smmu_domain->s2 || domain->type != IOMMU_DOMAIN_NESTED)
+		return;
+
+	switch (inv_info->opcode) {
+	case CMDQ_OP_CFGI_CD:
+	case CMDQ_OP_CFGI_CD_ALL:
+		return arm_smmu_sync_cd(smmu_domain, inv_info->ssid, true);
+	case CMDQ_OP_TLBI_NH_VA:
+		cmd.tlbi.asid = inv_info->asid;
+		fallthrough;
+	case CMDQ_OP_TLBI_NH_VAA:
+		if (!granule_size || !(granule_size & smmu->pgsize_bitmap) ||
+		    granule_size & ~(1ULL << __ffs(granule_size)))
+			return;
+
+		iova = inv_info->range.start;
+		size = inv_info->range.last - inv_info->range.start + 1;
+		if (!size)
+			return;
+
+		cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
+		cmd.tlbi.leaf = inv_info->flags & IOMMU_SMMUV3_CMDQ_TLBI_VA_LEAF;
+		__arm_smmu_tlb_inv_range(&cmd, iova, size, granule_size, smmu_domain);
+		break;
+	case CMDQ_OP_TLBI_NH_ASID:
+		cmd.tlbi.asid = inv_info->asid;
+		fallthrough;
+	case CMDQ_OP_TLBI_NH_ALL:
+		cmd.tlbi.vmid = smmu_domain->s2->s2_cfg.vmid;
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+		break;
+	case CMDQ_OP_ATC_INV:
+		ssid = inv_info->ssid;
+		iova = inv_info->range.start;
+		size = inv_info->range.last - inv_info->range.start + 1;
+		break;
+	default:
+		return;
+	}
+
+	arm_smmu_atc_inv_domain(smmu_domain, ssid, iova, size);
+}
+
 static const struct iommu_domain_ops arm_smmu_nested_domain_ops = {
 	.attach_dev		= arm_smmu_attach_dev,
 	.free			= arm_smmu_domain_free,
+	.cache_invalidate_user	= arm_smmu_cache_invalidate_user,
 };
 
 static struct iommu_domain *