[v4,13/21] IOMMU/x86: prefill newly allocated page tables

Message ID 9d073a05-0c7d-4989-7a38-93cd5b01d071@suse.com (mailing list archive)
State Superseded
Series IOMMU: superpage support when not sharing pagetables

Commit Message

Jan Beulich April 25, 2022, 8:40 a.m. UTC
Page tables are used for two purposes after allocation: They either
start out all empty, or they get filled to replace a superpage.
Subsequently, to replace all empty or fully contiguous page tables,
contiguous sub-regions will be recorded within individual page tables.
Install the initial set of markers immediately after allocation. Make
sure to retain these markers when further populating a page table in
preparation for it to replace a superpage.

The markers are simply 4-bit fields holding the order value of
contiguous entries. To demonstrate this, if a page table had just 16
entries, this would be the initial (fully contiguous) set of markers:

index  0 1 2 3 4 5 6 7 8 9 A B C D E F
marker 4 0 1 0 2 0 1 0 3 0 1 0 2 0 1 0

"Contiguous" here means not only present entries with successively
increasing MFNs, each one suitably aligned for its slot, but also a
respective number of all non-present entries.
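
As a standalone illustration (a sketch, not part of the patch;
__builtin_ctz() standing in for Xen's find_first_set_bit()), the
markers above can be computed like this:

    #include <stdio.h>

    /* Marker for slot i of a 2^order-entry table: slot 0 carries the
     * full table order; slot i > 0 carries the number of trailing zero
     * bits of i, i.e. the order of the contiguous run anchored there. */
    static unsigned int init_marker(unsigned int i, unsigned int order)
    {
        return i ? (unsigned int)__builtin_ctz(i) : order;
    }

    int main(void)
    {
        unsigned int i;

        for ( i = 0; i < 16; ++i )
            printf("%u ", init_marker(i, 4));
        printf("\n"); /* prints: 4 0 1 0 2 0 1 0 3 0 1 0 2 0 1 0 */
        return 0;
    }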

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
---
An alternative to the ASSERT()s added to set_iommu_ptes_present() would
be to make the function less general-purpose; it's used in a single
place only after all (i.e. it might as well be folded into its only
caller).

While in VT-d's comment ahead of struct dma_pte I'm adjusting the
description of the high bits, I'd like to note that the description of
some of the lower bits isn't correct either. Yet I don't think adjusting
that belongs here.
---
v4: Add another comment referring to pt-contig-markers.h. Re-base.
v3: Add comments. Re-base.
v2: New.

Comments

Roger Pau Monne May 6, 2022, 11:16 a.m. UTC | #1
On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
> Page tables are used for two purposes after allocation: They either
> start out all empty, or they get filled to replace a superpage.
> Subsequently, to replace all empty or fully contiguous page tables,
> contiguous sub-regions will be recorded within individual page tables.
> Install the initial set of markers immediately after allocation. Make
> sure to retain these markers when further populating a page table in
> preparation for it to replace a superpage.
> 
> The markers are simply 4-bit fields holding the order value of
> contiguous entries. To demonstrate this, if a page table had just 16
> entries, this would be the initial (fully contiguous) set of markers:
> 
> index  0 1 2 3 4 5 6 7 8 9 A B C D E F
> marker 4 0 1 0 2 0 1 0 3 0 1 0 2 0 1 0
> 
> "Contiguous" here means not only present entries with successively
> increasing MFNs, each one suitably aligned for its slot, but also a
> respective number of all non-present entries.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> ---
> An alternative to the ASSERT()s added to set_iommu_ptes_present() would
> be to make the function less general-purpose; it's used in a single
> place only after all (i.e. it might as well be folded into its only
> caller).

I would think adding a comment that the function requires the PDE to
be empty would be good.  Also given the current usage we could drop
the nr_ptes parameter and just name the function fill_pde() or
similar.

> 
> While in VT-d's comment ahead of struct dma_pte I'm adjusting the
> description of the high bits, I'd like to note that the description of
> some of the lower bits isn't correct either. Yet I don't think adjusting
> that belongs here.
> ---
> v4: Add another comment referring to pt-contig-markers.h. Re-base.
> v3: Add comments. Re-base.
> v2: New.
> 
> --- a/xen/arch/x86/include/asm/iommu.h
> +++ b/xen/arch/x86/include/asm/iommu.h
> @@ -146,7 +146,8 @@ void iommu_free_domid(domid_t domid, uns
>  
>  int __must_check iommu_free_pgtables(struct domain *d);
>  struct domain_iommu;
> -struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
> +struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd,
> +                                                   uint64_t contig_mask);
>  void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg);
>  
>  #endif /* !__ARCH_X86_IOMMU_H__ */
> --- a/xen/drivers/passthrough/amd/iommu-defs.h
> +++ b/xen/drivers/passthrough/amd/iommu-defs.h
> @@ -446,11 +446,13 @@ union amd_iommu_x2apic_control {
>  #define IOMMU_PAGE_TABLE_U32_PER_ENTRY	(IOMMU_PAGE_TABLE_ENTRY_SIZE / 4)
>  #define IOMMU_PAGE_TABLE_ALIGNMENT	4096
>  
> +#define IOMMU_PTE_CONTIG_MASK           0x1e /* The ign0 field below. */
> +
>  union amd_iommu_pte {
>      uint64_t raw;
>      struct {
>          bool pr:1;
> -        unsigned int ign0:4;
> +        unsigned int ign0:4; /* Covered by IOMMU_PTE_CONTIG_MASK. */
>          bool a:1;
>          bool d:1;
>          unsigned int ign1:2;
> --- a/xen/drivers/passthrough/amd/iommu_map.c
> +++ b/xen/drivers/passthrough/amd/iommu_map.c
> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
>  
>      while ( nr_ptes-- )
>      {
> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
> +        ASSERT(!pde->next_level);
> +        ASSERT(!pde->u);
> +
> +        if ( pde > table )
> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
> +        else
> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);

I think PAGETABLE_ORDER would be clearer here.

While here, could you also assert that next_mfn matches the contiguous
order currently set in the PTE?
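
(A hypothetical sketch of such a check; per the reply below it would
sit ahead of the loop, keyed off the marker of the first PTE to be
rewritten:)

    /* Hypothetical: the first MFN of the range should be suitably
     * aligned for the contiguous order recorded in the marker field. */
    ASSERT(!(next_mfn & ((1UL << pde->ign0) - 1)));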

> +
> +        pde->iw = iw;
> +        pde->ir = ir;
> +        pde->fc = true; /* See set_iommu_pde_present(). */
> +        pde->mfn = next_mfn;
> +        pde->pr = true;
>  
>          ++pde;
>          next_mfn += page_sz;
> @@ -295,7 +307,7 @@ static int iommu_pde_from_dfn(struct dom
>              mfn = next_table_mfn;
>  
>              /* allocate lower level page table */
> -            table = iommu_alloc_pgtable(hd);
> +            table = iommu_alloc_pgtable(hd, IOMMU_PTE_CONTIG_MASK);
>              if ( table == NULL )
>              {
>                  AMD_IOMMU_ERROR("cannot allocate I/O page table\n");
> @@ -325,7 +337,7 @@ static int iommu_pde_from_dfn(struct dom
>  
>              if ( next_table_mfn == 0 )
>              {
> -                table = iommu_alloc_pgtable(hd);
> +                table = iommu_alloc_pgtable(hd, IOMMU_PTE_CONTIG_MASK);
>                  if ( table == NULL )
>                  {
>                      AMD_IOMMU_ERROR("cannot allocate I/O page table\n");
> @@ -717,7 +729,7 @@ static int fill_qpt(union amd_iommu_pte
>                   * page table pages, and the resulting allocations are always
>                   * zeroed.
>                   */
> -                pgs[level] = iommu_alloc_pgtable(hd);
> +                pgs[level] = iommu_alloc_pgtable(hd, 0);

Is it worth not setting up the contiguous data for quarantine page
tables?

I think it's fine now given the current code, but you having added
ASSERTs that the contig data is correct in set_iommu_ptes_present()
makes me wonder whether we could trigger those in the future.

I understand that the contig data is not helpful for quarantine page
tables, but still doesn't seem bad to have it just for coherency.

>                  if ( !pgs[level] )
>                  {
>                      rc = -ENOMEM;
> @@ -775,7 +787,7 @@ int cf_check amd_iommu_quarantine_init(s
>          return 0;
>      }
>  
> -    pdev->arch.amd.root_table = iommu_alloc_pgtable(hd);
> +    pdev->arch.amd.root_table = iommu_alloc_pgtable(hd, 0);
>      if ( !pdev->arch.amd.root_table )
>          return -ENOMEM;
>  
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -342,7 +342,7 @@ int amd_iommu_alloc_root(struct domain *
>  
>      if ( unlikely(!hd->arch.amd.root_table) && d != dom_io )
>      {
> -        hd->arch.amd.root_table = iommu_alloc_pgtable(hd);
> +        hd->arch.amd.root_table = iommu_alloc_pgtable(hd, 0);
>          if ( !hd->arch.amd.root_table )
>              return -ENOMEM;
>      }
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -334,7 +334,7 @@ static uint64_t addr_to_dma_page_maddr(s
>              goto out;
>  
>          pte_maddr = level;
> -        if ( !(pg = iommu_alloc_pgtable(hd)) )
> +        if ( !(pg = iommu_alloc_pgtable(hd, 0)) )
>              goto out;
>  
>          hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
> @@ -376,7 +376,7 @@ static uint64_t addr_to_dma_page_maddr(s
>              }
>  
>              pte_maddr = level - 1;
> -            pg = iommu_alloc_pgtable(hd);
> +            pg = iommu_alloc_pgtable(hd, DMA_PTE_CONTIG_MASK);
>              if ( !pg )
>                  break;
>  
> @@ -388,12 +388,13 @@ static uint64_t addr_to_dma_page_maddr(s
>                  struct dma_pte *split = map_vtd_domain_page(pte_maddr);
>                  unsigned long inc = 1UL << level_to_offset_bits(level - 1);
>  
> -                split[0].val = pte->val;
> +                split[0].val |= pte->val & ~DMA_PTE_CONTIG_MASK;
>                  if ( inc == PAGE_SIZE )
>                      split[0].val &= ~DMA_PTE_SP;
>  
>                  for ( offset = 1; offset < PTE_NUM; ++offset )
> -                    split[offset].val = split[offset - 1].val + inc;
> +                    split[offset].val |=
> +                        (split[offset - 1].val & ~DMA_PTE_CONTIG_MASK) + inc;
>  
>                  iommu_sync_cache(split, PAGE_SIZE);
>                  unmap_vtd_domain_page(split);
> @@ -2173,7 +2174,7 @@ static int __must_check cf_check intel_i
>      if ( iommu_snoop )
>          dma_set_pte_snp(new);
>  
> -    if ( old.val == new.val )
> +    if ( !((old.val ^ new.val) & ~DMA_PTE_CONTIG_MASK) )
>      {
>          spin_unlock(&hd->arch.mapping_lock);
>          unmap_vtd_domain_page(page);
> @@ -3052,7 +3053,7 @@ static int fill_qpt(struct dma_pte *this
>                   * page table pages, and the resulting allocations are always
>                   * zeroed.
>                   */
> -                pgs[level] = iommu_alloc_pgtable(hd);
> +                pgs[level] = iommu_alloc_pgtable(hd, 0);
>                  if ( !pgs[level] )
>                  {
>                      rc = -ENOMEM;
> @@ -3109,7 +3110,7 @@ static int cf_check intel_iommu_quaranti
>      if ( !drhd )
>          return -ENODEV;
>  
> -    pg = iommu_alloc_pgtable(hd);
> +    pg = iommu_alloc_pgtable(hd, 0);
>      if ( !pg )
>          return -ENOMEM;
>  
> --- a/xen/drivers/passthrough/vtd/iommu.h
> +++ b/xen/drivers/passthrough/vtd/iommu.h
> @@ -253,7 +253,10 @@ struct context_entry {
>   * 2-6: reserved
>   * 7: super page
>   * 8-11: available
> - * 12-63: Host physcial address
> + * 12-51: Host physcial address
> + * 52-61: available (52-55 used for DMA_PTE_CONTIG_MASK)
> + * 62: reserved
> + * 63: available
>   */
>  struct dma_pte {
>      u64 val;
> @@ -263,6 +266,7 @@ struct dma_pte {
>  #define DMA_PTE_PROT (DMA_PTE_READ | DMA_PTE_WRITE)
>  #define DMA_PTE_SP   (1 << 7)
>  #define DMA_PTE_SNP  (1 << 11)
> +#define DMA_PTE_CONTIG_MASK  (0xfull << PADDR_BITS)
>  #define dma_clear_pte(p)    do {(p).val = 0;} while(0)
>  #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0)
>  #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0)
> @@ -276,7 +280,7 @@ struct dma_pte {
>  #define dma_pte_write(p) (dma_pte_prot(p) & DMA_PTE_WRITE)
>  #define dma_pte_addr(p) ((p).val & PADDR_MASK & PAGE_MASK_4K)
>  #define dma_set_pte_addr(p, addr) do {\
> -            (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
> +            (p).val |= ((addr) & PADDR_MASK & PAGE_MASK_4K); } while (0)

While I'm not opposed to this, I would assume that addr is not
expected to contain bits cleared by PADDR_MASK? (or PAGE_MASK_4K FWIW)

Or else callers are really messed up.

>  #define dma_pte_present(p) (((p).val & DMA_PTE_PROT) != 0)
>  #define dma_pte_superpage(p) (((p).val & DMA_PTE_SP) != 0)
>  
> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -522,11 +522,12 @@ int iommu_free_pgtables(struct domain *d
>      return 0;
>  }
>  
> -struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd)
> +struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd,
> +                                      uint64_t contig_mask)
>  {
>      unsigned int memflags = 0;
>      struct page_info *pg;
> -    void *p;
> +    uint64_t *p;
>  
>  #ifdef CONFIG_NUMA
>      if ( hd->node != NUMA_NO_NODE )
> @@ -538,7 +539,29 @@ struct page_info *iommu_alloc_pgtable(st
>          return NULL;
>  
>      p = __map_domain_page(pg);
> -    clear_page(p);
> +
> +    if ( contig_mask )
> +    {
> +        /* See pt-contig-markers.h for a description of the marker scheme. */
> +        unsigned int i, shift = find_first_set_bit(contig_mask);
> +
> +        ASSERT(((PAGE_SHIFT - 3) & (contig_mask >> shift)) == PAGE_SHIFT - 3);

I think it might be clearer to use PAGETABLE_ORDER rather than
PAGE_SHIFT - 3.

Thanks, Roger.
Jan Beulich May 19, 2022, 12:12 p.m. UTC | #2
On 06.05.2022 13:16, Roger Pau Monné wrote:
> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
>> ---
>> An alternative to the ASSERT()s added to set_iommu_ptes_present() would
>> be to make the function less general-purpose; it's used in a single
>> place only after all (i.e. it might as well be folded into its only
>> caller).
> 
> I would think adding a comment that the function requires the PDE to
> be empty would be good.

But that's not the case - what the function expects to be clear is
what is being ASSERT()ed.

>  Also given the current usage we could drop
> the nr_ptes parameter and just name the function fill_pde() or
> similar.

Right, but that would want to be a separate change.

>> --- a/xen/drivers/passthrough/amd/iommu_map.c
>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
>>  
>>      while ( nr_ptes-- )
>>      {
>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
>> +        ASSERT(!pde->next_level);
>> +        ASSERT(!pde->u);
>> +
>> +        if ( pde > table )
>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
>> +        else
>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
> 
> I think PAGETABLE_ORDER would be clearer here.

I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
in IOMMU code afaics.

> While here, could you also assert that next_mfn matches the contiguous
> order currently set in the PTE?

I can, yet that wouldn't be here, but outside (ahead) of the loop.

>> @@ -717,7 +729,7 @@ static int fill_qpt(union amd_iommu_pte
>>                   * page table pages, and the resulting allocations are always
>>                   * zeroed.
>>                   */
>> -                pgs[level] = iommu_alloc_pgtable(hd);
>> +                pgs[level] = iommu_alloc_pgtable(hd, 0);
> 
> Is it worth not setting up the contiguous data for quarantine page
> tables?

Well, it's (slightly) less code, and (hopefully) faster due to the use
of clear_page().

> I think it's fine now given the current code, but you having added
> ASSERTs that the contig data is correct in set_iommu_ptes_present()
> makes me wonder whether we could trigger those in the future.

I'd like to deal with that if and when needed.

> I understand that the contig data is not helpful for quarantine page
> tables, but still doesn't seem bad to have it just for coherency.

You realize that the markers all being zero in a table is a valid
state, functionality-wise? It would merely mean no re-coalescing
until respective entries were touched (updated) at least once.

>> @@ -276,7 +280,7 @@ struct dma_pte {
>>  #define dma_pte_write(p) (dma_pte_prot(p) & DMA_PTE_WRITE)
>>  #define dma_pte_addr(p) ((p).val & PADDR_MASK & PAGE_MASK_4K)
>>  #define dma_set_pte_addr(p, addr) do {\
>> -            (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
>> +            (p).val |= ((addr) & PADDR_MASK & PAGE_MASK_4K); } while (0)
> 
> While I'm not opposed to this, I would assume that addr is not
> expected to contain bits cleared by PADDR_MASK? (or PAGE_MASK_4K FWIW)

Indeed. But I'd prefer to be on the safe side, now that some of the
bits have gained a different meaning.

>> @@ -538,7 +539,29 @@ struct page_info *iommu_alloc_pgtable(st
>>          return NULL;
>>  
>>      p = __map_domain_page(pg);
>> -    clear_page(p);
>> +
>> +    if ( contig_mask )
>> +    {
>> +        /* See pt-contig-markers.h for a description of the marker scheme. */
>> +        unsigned int i, shift = find_first_set_bit(contig_mask);
>> +
>> +        ASSERT(((PAGE_SHIFT - 3) & (contig_mask >> shift)) == PAGE_SHIFT - 3);
> 
> I think it might be clearer to use PAGETABLE_ORDER rather than
> PAGE_SHIFT - 3.

See above.

Jan
Roger Pau Monne May 20, 2022, 10:47 a.m. UTC | #3
On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
> On 06.05.2022 13:16, Roger Pau Monné wrote:
> > On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
> >> ---
> >> An alternative to the ASSERT()s added to set_iommu_ptes_present() would
> >> be to make the function less general-purpose; it's used in a single
> >> place only after all (i.e. it might as well be folded into its only
> >> caller).
> > 
> > I would think adding a comment that the function requires the PDE to
> > be empty would be good.
> 
> But that's not the case - what the function expects to be clear is
> what is being ASSERT()ed.
> 
> >  Also given the current usage we could drop
> > the nr_ptes parameter and just name the function fill_pde() or
> > similar.
> 
> Right, but that would want to be a separate change.
> 
> >> --- a/xen/drivers/passthrough/amd/iommu_map.c
> >> +++ b/xen/drivers/passthrough/amd/iommu_map.c
> >> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
> >>  
> >>      while ( nr_ptes-- )
> >>      {
> >> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
> >> +        ASSERT(!pde->next_level);
> >> +        ASSERT(!pde->u);
> >> +
> >> +        if ( pde > table )
> >> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
> >> +        else
> >> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
> > 
> > I think PAGETABLE_ORDER would be clearer here.
> 
> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
> in IOMMU code afaics.

Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
sure what's the rule for declaring that PAGE_SHIFT is fine to use in
IOMMU code  but not PAGETABLE_ORDER.

Thanks, Roger.
Jan Beulich May 20, 2022, 11:11 a.m. UTC | #4
On 20.05.2022 12:47, Roger Pau Monné wrote:
> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
>> On 06.05.2022 13:16, Roger Pau Monné wrote:
>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
>>>>  
>>>>      while ( nr_ptes-- )
>>>>      {
>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
>>>> +        ASSERT(!pde->next_level);
>>>> +        ASSERT(!pde->u);
>>>> +
>>>> +        if ( pde > table )
>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
>>>> +        else
>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
>>>
>>> I think PAGETABLE_ORDER would be clearer here.
>>
>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
>> in IOMMU code afaics.
> 
> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
> IOMMU code  but not PAGETABLE_ORDER.

Hmm, yes and no. But for consistency with other IOMMU code I may want
to switch to PAGE_SHIFT_4K.

Jan
Jan Beulich May 20, 2022, 11:13 a.m. UTC | #5
On 20.05.2022 13:11, Jan Beulich wrote:
> On 20.05.2022 12:47, Roger Pau Monné wrote:
>> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
>>> On 06.05.2022 13:16, Roger Pau Monné wrote:
>>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
>>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
>>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
>>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
>>>>>  
>>>>>      while ( nr_ptes-- )
>>>>>      {
>>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
>>>>> +        ASSERT(!pde->next_level);
>>>>> +        ASSERT(!pde->u);
>>>>> +
>>>>> +        if ( pde > table )
>>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
>>>>> +        else
>>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
>>>>
>>>> I think PAGETABLE_ORDER would be clearer here.
>>>
>>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
>>> in IOMMU code afaics.
>>
>> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
>> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
>> IOMMU code  but not PAGETABLE_ORDER.
> 
> Hmm, yes and no. But for consistency with other IOMMU code I may want
> to switch to PAGE_SHIFT_4K.

Except that, with the plan to re-use pt_update_contig_markers() for CPU-
side re-coalescing, there I'd prefer to stick to PAGE_SHIFT.

Jan
Roger Pau Monne May 20, 2022, 12:22 p.m. UTC | #6
On Fri, May 20, 2022 at 01:13:28PM +0200, Jan Beulich wrote:
> On 20.05.2022 13:11, Jan Beulich wrote:
> > On 20.05.2022 12:47, Roger Pau Monné wrote:
> >> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
> >>> On 06.05.2022 13:16, Roger Pau Monné wrote:
> >>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
> >>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
> >>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
> >>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
> >>>>>  
> >>>>>      while ( nr_ptes-- )
> >>>>>      {
> >>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
> >>>>> +        ASSERT(!pde->next_level);
> >>>>> +        ASSERT(!pde->u);
> >>>>> +
> >>>>> +        if ( pde > table )
> >>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
> >>>>> +        else
> >>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
> >>>>
> >>>> I think PAGETABLE_ORDER would be clearer here.
> >>>
> >>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
> >>> in IOMMU code afaics.
> >>
> >> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
> >> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
> >> IOMMU code  but not PAGETABLE_ORDER.
> > 
> > Hmm, yes and no. But for consistency with other IOMMU code I may want
> > to switch to PAGE_SHIFT_4K.
> 
> Except that, with the plan to re-use pt_update_contig_markers() for CPU-
> side re-coalescing, there I'd prefer to stick to PAGE_SHIFT.

Then can PAGETABLE_ORDER be used instead of PAGE_SHIFT - 3?

IMO it makes the code quite easier to understand.

Thanks, Roger.
Jan Beulich May 20, 2022, 12:36 p.m. UTC | #7
On 20.05.2022 14:22, Roger Pau Monné wrote:
> On Fri, May 20, 2022 at 01:13:28PM +0200, Jan Beulich wrote:
>> On 20.05.2022 13:11, Jan Beulich wrote:
>>> On 20.05.2022 12:47, Roger Pau Monné wrote:
>>>> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
>>>>> On 06.05.2022 13:16, Roger Pau Monné wrote:
>>>>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
>>>>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
>>>>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
>>>>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
>>>>>>>  
>>>>>>>      while ( nr_ptes-- )
>>>>>>>      {
>>>>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
>>>>>>> +        ASSERT(!pde->next_level);
>>>>>>> +        ASSERT(!pde->u);
>>>>>>> +
>>>>>>> +        if ( pde > table )
>>>>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
>>>>>>> +        else
>>>>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
>>>>>>
>>>>>> I think PAGETABLE_ORDER would be clearer here.
>>>>>
>>>>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
>>>>> in IOMMU code afaics.
>>>>
>>>> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
>>>> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
>>>> IOMMU code  but not PAGETABLE_ORDER.
>>>
>>> Hmm, yes and no. But for consistency with other IOMMU code I may want
>>> to switch to PAGE_SHIFT_4K.
>>
>> Except that, with the plan to re-use pt_update_contig_markers() for CPU-
>> side re-coalescing, there I'd prefer to stick to PAGE_SHIFT.
> 
> Then can PAGETABLE_ORDER be used instead of PAGE_SHIFT - 3?

pt_update_contig_markers() isn't IOMMU code; since I've said I'd switch
to PAGE_SHIFT_4K in IOMMU code I'm having a hard time seeing how I could
at the same time start using PAGETABLE_ORDER there.

What I maybe could do is use PTE_PER_TABLE_SHIFT in AMD code and
LEVEL_STRIDE in VT-d one. Yet I'm not sure that would be fully correct/
consistent, ...

> IMO it makes the code quite easier to understand.

... or in fact helping readability.

Jan
Roger Pau Monne May 20, 2022, 2:28 p.m. UTC | #8
On Fri, May 20, 2022 at 02:36:02PM +0200, Jan Beulich wrote:
> On 20.05.2022 14:22, Roger Pau Monné wrote:
> > On Fri, May 20, 2022 at 01:13:28PM +0200, Jan Beulich wrote:
> >> On 20.05.2022 13:11, Jan Beulich wrote:
> >>> On 20.05.2022 12:47, Roger Pau Monné wrote:
> >>>> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
> >>>>> On 06.05.2022 13:16, Roger Pau Monné wrote:
> >>>>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
> >>>>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
> >>>>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
> >>>>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
> >>>>>>>  
> >>>>>>>      while ( nr_ptes-- )
> >>>>>>>      {
> >>>>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
> >>>>>>> +        ASSERT(!pde->next_level);
> >>>>>>> +        ASSERT(!pde->u);
> >>>>>>> +
> >>>>>>> +        if ( pde > table )
> >>>>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
> >>>>>>> +        else
> >>>>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
> >>>>>>
> >>>>>> I think PAGETABLE_ORDER would be clearer here.
> >>>>>
> >>>>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
> >>>>> in IOMMU code afaics.
> >>>>
> >>>> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
> >>>> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
> >>>> IOMMU code  but not PAGETABLE_ORDER.
> >>>
> >>> Hmm, yes and no. But for consistency with other IOMMU code I may want
> >>> to switch to PAGE_SHIFT_4K.
> >>
> >> Except that, with the plan to re-use pt_update_contig_markers() for CPU-
> >> side re-coalescing, there I'd prefer to stick to PAGE_SHIFT.
> > 
> > Then can PAGETABLE_ORDER be used instead of PAGE_SHIFT - 3?
> 
> pt_update_contig_markers() isn't IOMMU code; since I've said I'd switch
> to PAGE_SHIFT_4K in IOMMU code I'm having a hard time seeing how I could
> at the same time start using PAGETABLE_ORDER there.

I've got confused by the double reply and read it as if you were
going to stick to using PAGE_SHIFT everywhere as proposed originally.

> What I maybe could do is use PTE_PER_TABLE_SHIFT in AMD code and
> LEVEL_STRIDE in VT-d one. Yet I'm not sure that would be fully correct/
> consistent, ...
> 
> > IMO it makes the code quite easier to understand.
> 
> ... or in fact helping readability.

Looking at pt_update_contig_markers() we hardcode CONTIG_LEVEL_SHIFT
to 9 there, which means all users must have a page table order of 9.

It seems to me we are just making things more complicated than
necessary by trying to avoid dependencies between CPU and IOMMU
page-table sizes and definitions, when the underlying mechanism to set
->ign0 has those assumptions baked in.

Would it help if you introduced a PAGE_TABLE_ORDER in page-defs.h?

Thanks, Roger.
Roger Pau Monne May 20, 2022, 2:38 p.m. UTC | #9
On Fri, May 20, 2022 at 04:28:14PM +0200, Roger Pau Monné wrote:
> On Fri, May 20, 2022 at 02:36:02PM +0200, Jan Beulich wrote:
> > On 20.05.2022 14:22, Roger Pau Monné wrote:
> > > On Fri, May 20, 2022 at 01:13:28PM +0200, Jan Beulich wrote:
> > >> On 20.05.2022 13:11, Jan Beulich wrote:
> > >>> On 20.05.2022 12:47, Roger Pau Monné wrote:
> > >>>> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
> > >>>>> On 06.05.2022 13:16, Roger Pau Monné wrote:
> > >>>>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
> > >>>>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
> > >>>>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
> > >>>>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
> > >>>>>>>  
> > >>>>>>>      while ( nr_ptes-- )
> > >>>>>>>      {
> > >>>>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
> > >>>>>>> +        ASSERT(!pde->next_level);
> > >>>>>>> +        ASSERT(!pde->u);
> > >>>>>>> +
> > >>>>>>> +        if ( pde > table )
> > >>>>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
> > >>>>>>> +        else
> > >>>>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
> > >>>>>>
> > >>>>>> I think PAGETABLE_ORDER would be clearer here.
> > >>>>>
> > >>>>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
> > >>>>> in IOMMU code afaics.
> > >>>>
> > >>>> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
> > >>>> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
> > >>>> IOMMU code  but not PAGETABLE_ORDER.
> > >>>
> > >>> Hmm, yes and no. But for consistency with other IOMMU code I may want
> > >>> to switch to PAGE_SHIFT_4K.
> > >>
> > >> Except that, with the plan to re-use pt_update_contig_markers() for CPU-
> > >> side re-coalescing, there I'd prefer to stick to PAGE_SHIFT.
> > > 
> > > Then can PAGETABLE_ORDER be used instead of PAGE_SHIFT - 3?
> > 
> > pt_update_contig_markers() isn't IOMMU code; since I've said I'd switch
> > to PAGE_SHIFT_4K in IOMMU code I'm having a hard time seeing how I could
> > at the same time start using PAGETABLE_ORDER there.
> 
> I've got confused by the double reply and read it as if you were
> going to stick to using PAGE_SHIFT everywhere as proposed originally.
> 
> > What I maybe could do is use PTE_PER_TABLE_SHIFT in AMD code and
> > LEVEL_STRIDE in VT-d one. Yet I'm not sure that would be fully correct/
> > consistent, ...
> > 
> > > IMO it makes the code quite easier to understand.
> > 
> > ... or in fact helping readability.
> 
> Looking at pt_update_contig_markers() we hardcode CONTIG_LEVEL_SHIFT
> to 9 there, which means all users must have a page table order of 9.
> 
> It seems to me we are just making things more complicated than
> necessary by trying to avoid dependencies between CPU and IOMMU
> page-table sizes and definitions, when the underlying mechanism to set
> ->ign0 has those assumptions baked in.
> 
> Would it help if you introduced a PAGE_TABLE_ORDER in page-defs.h?

Sorry, should be PAGE_TABLE_ORDER_4K.

Roger.
Jan Beulich May 23, 2022, 6:49 a.m. UTC | #10
On 20.05.2022 16:38, Roger Pau Monné wrote:
> On Fri, May 20, 2022 at 04:28:14PM +0200, Roger Pau Monné wrote:
>> On Fri, May 20, 2022 at 02:36:02PM +0200, Jan Beulich wrote:
>>> On 20.05.2022 14:22, Roger Pau Monné wrote:
>>>> On Fri, May 20, 2022 at 01:13:28PM +0200, Jan Beulich wrote:
>>>>> On 20.05.2022 13:11, Jan Beulich wrote:
>>>>>> On 20.05.2022 12:47, Roger Pau Monné wrote:
>>>>>>> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
>>>>>>>> On 06.05.2022 13:16, Roger Pau Monné wrote:
>>>>>>>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
>>>>>>>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
>>>>>>>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
>>>>>>>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
>>>>>>>>>>  
>>>>>>>>>>      while ( nr_ptes-- )
>>>>>>>>>>      {
>>>>>>>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
>>>>>>>>>> +        ASSERT(!pde->next_level);
>>>>>>>>>> +        ASSERT(!pde->u);
>>>>>>>>>> +
>>>>>>>>>> +        if ( pde > table )
>>>>>>>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
>>>>>>>>>> +        else
>>>>>>>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
>>>>>>>>>
>>>>>>>>> I think PAGETABLE_ORDER would be clearer here.
>>>>>>>>
>>>>>>>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
>>>>>>>> in IOMMU code afaics.
>>>>>>>
>>>>>>> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
>>>>>>> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
>>>>>>> IOMMU code  but not PAGETABLE_ORDER.
>>>>>>
>>>>>> Hmm, yes and no. But for consistency with other IOMMU code I may want
>>>>>> to switch to PAGE_SHIFT_4K.
>>>>>
>>>>> Except that, with the plan to re-use pt_update_contig_markers() for CPU-
>>>>> side re-coalescing, there I'd prefer to stick to PAGE_SHIFT.
>>>>
>>>> Then can PAGETABLE_ORDER be used instead of PAGE_SHIFT - 3?
>>>
>>> pt_update_contig_markers() isn't IOMMU code; since I've said I'd switch
>>> to PAGE_SHIFT_4K in IOMMU code I'm having a hard time seeing how I could
>>> at the same time start using PAGETABLE_ORDER there.
>>
>> I've got confused by the double reply and read it as if you were
>> going to stick to using PAGE_SHIFT everywhere as proposed originally.
>>
>>> What I maybe could do is use PTE_PER_TABLE_SHIFT in AMD code and
>>> LEVEL_STRIDE in VT-d one. Yet I'm not sure that would be fully correct/
>>> consistent, ...
>>>
>>>> IMO it makes the code quite easier to understand.
>>>
>>> ... or in fact helping readability.
>>
>> Looking at pt_update_contig_markers() we hardcode CONTIG_LEVEL_SHIFT
>> to 9 there, which means all users must have a page table order of 9.
>>
>> It seems to me we are just making things more complicated than
>> necessary by trying to avoid dependencies between CPU and IOMMU
>> page-table sizes and definitions, when the underlying mechanism to set
>> ->ign0 has those assumptions baked in.
>>
>> Would it help if you introduced a PAGE_TABLE_ORDER in page-defs.h?
> 
> Sorry, should be PAGE_TABLE_ORDER_4K.

Oh, good that I looked here before replying to the earlier mail: I'm
afraid I view PAGE_TABLE_ORDER_4K as not very useful. From an
abstract POV, what is the base unit meant to be that the order
is based upon? PAGE_SHIFT? Or PAGE_SHIFT_4K? I think such an
ambiguity is going to remain even if we very clearly spelled out what
we mean things to be, as one would always need to go back to that
comment to check which of the two possible ways it is.

Furthermore I'm not convinced PAGETABLE_ORDER is really meant to be
associated with a particular page size anyway: PAGE_TABLE_ORDER_2M
imo makes no sense at all. And page-defs.h is not supposed to
express any platform properties anyway, it's merely an accumulation
of (believed) useful constants.

Hence the only thing which I might see as a (remote) option is
IOMMU_PAGE_TABLE_ORDER (for platforms where all IOMMU variants have
all page table levels using identical sizes, which isn't a given, but
which would hold for x86 and hence for the purpose here).

Jan
Roger Pau Monne May 23, 2022, 9:10 a.m. UTC | #11
On Mon, May 23, 2022 at 08:49:27AM +0200, Jan Beulich wrote:
> On 20.05.2022 16:38, Roger Pau Monné wrote:
> > On Fri, May 20, 2022 at 04:28:14PM +0200, Roger Pau Monné wrote:
> >> On Fri, May 20, 2022 at 02:36:02PM +0200, Jan Beulich wrote:
> >>> On 20.05.2022 14:22, Roger Pau Monné wrote:
> >>>> On Fri, May 20, 2022 at 01:13:28PM +0200, Jan Beulich wrote:
> >>>>> On 20.05.2022 13:11, Jan Beulich wrote:
> >>>>>> On 20.05.2022 12:47, Roger Pau Monné wrote:
> >>>>>>> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
> >>>>>>>> On 06.05.2022 13:16, Roger Pau Monné wrote:
> >>>>>>>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
> >>>>>>>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
> >>>>>>>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
> >>>>>>>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
> >>>>>>>>>>  
> >>>>>>>>>>      while ( nr_ptes-- )
> >>>>>>>>>>      {
> >>>>>>>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
> >>>>>>>>>> +        ASSERT(!pde->next_level);
> >>>>>>>>>> +        ASSERT(!pde->u);
> >>>>>>>>>> +
> >>>>>>>>>> +        if ( pde > table )
> >>>>>>>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
> >>>>>>>>>> +        else
> >>>>>>>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
> >>>>>>>>>
> >>>>>>>>> I think PAGETABLE_ORDER would be clearer here.
> >>>>>>>>
> >>>>>>>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
> >>>>>>>> in IOMMU code afaics.
> >>>>>>>
> >>>>>>> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
> >>>>>>> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
> >>>>>>> IOMMU code  but not PAGETABLE_ORDER.
> >>>>>>
> >>>>>> Hmm, yes and no. But for consistency with other IOMMU code I may want
> >>>>>> to switch to PAGE_SHIFT_4K.
> >>>>>
> >>>>> Except that, with the plan to re-use pt_update_contig_markers() for CPU-
> >>>>> side re-coalescing, there I'd prefer to stick to PAGE_SHIFT.
> >>>>
> >>>> Then can PAGETABLE_ORDER be used instead of PAGE_SHIFT - 3?
> >>>
> >>> pt_update_contig_markers() isn't IOMMU code; since I've said I'd switch
> >>> to PAGE_SHIFT_4K in IOMMU code I'm having a hard time seeing how I could
> >>> at the same time start using PAGETABLE_ORDER there.
> >>
> >> I've got confused by the double reply and read it as if you were
> >> going to stick to using PAGE_SHIFT everywhere as proposed originally.
> >>
> >>> What I maybe could do is use PTE_PER_TABLE_SHIFT in AMD code and
> >>> LEVEL_STRIDE in VT-d one. Yet I'm not sure that would be fully correct/
> >>> consistent, ...
> >>>
> >>>> IMO it makes the code quite easier to understand.
> >>>
> >>> ... or in fact helping readability.
> >>
> >> Looking at pt_update_contig_markers() we hardcode CONTIG_LEVEL_SHIFT
> >> to 9 there, which means all users must have a page table order of 9.
> >>
> >> It seems to me we are just making things more complicated than
> >> necessary by trying to avoid dependencies between CPU and IOMMU
> >> page-table sizes and definitions, when the underlying mechanism to set
> >> ->ign0 has those assumptions baked in.
> >>
> >> Would it help if you introduced a PAGE_TABLE_ORDER in page-defs.h?
> > 
> > Sorry, should be PAGE_TABLE_ORDER_4K.
> 
> Oh, good that I looked here before replying to the earlier mail: I'm
> afraid I view PAGE_TABLE_ORDER_4K as not very useful. From an
> abstract POV, what is the base unit meant to be that the order
> is based upon? PAGE_SHIFT? Or PAGE_SHIFT_4K? I think such an
> ambiguity is going to remain even if we very clearly spelled out what
> we mean things to be, as one would always need to go back to that
> comment to check which of the two possible ways it is.
> 
> Furthermore I'm not convinced PAGETABLE_ORDER is really meant to be
> associated with a particular page size anyway: PAGE_TABLE_ORDER_2M
> imo makes no sense at all. And page-defs.h is not supposed to
> express any platform properties anyway, it's merely an accumulation
> of (believed) useful constants.
> 
> Hence the only thing which I might see as a (remote) option is
> IOMMU_PAGE_TABLE_ORDER (for platforms where all IOMMU variants have
> all page table levels using identical sizes, which isn't a given, but
> which would hold for x86 and hence for the purpose here).

Since you already define a page table order in pt-contig-markers.h
(CONTIG_NR) it might be possible to export and use that?  In fact the
check done here would be even more accurate if it was done using the
same constant that's used in pt_update_contig_markers(), because the
purpose here is to check that the vendor specific code to init the
page tables has used the correct value.
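
(Hypothetically, if pt-contig-markers.h exported its CONTIG_LEVEL_SHIFT
constant, the AMD-side assertion could then read:)

    else
        ASSERT(pde->ign0 == CONTIG_LEVEL_SHIFT);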

Thanks, Roger.
Jan Beulich May 23, 2022, 10:52 a.m. UTC | #12
On 23.05.2022 11:10, Roger Pau Monné wrote:
> On Mon, May 23, 2022 at 08:49:27AM +0200, Jan Beulich wrote:
>> On 20.05.2022 16:38, Roger Pau Monné wrote:
>>> On Fri, May 20, 2022 at 04:28:14PM +0200, Roger Pau Monné wrote:
>>>> On Fri, May 20, 2022 at 02:36:02PM +0200, Jan Beulich wrote:
>>>>> On 20.05.2022 14:22, Roger Pau Monné wrote:
>>>>>> On Fri, May 20, 2022 at 01:13:28PM +0200, Jan Beulich wrote:
>>>>>>> On 20.05.2022 13:11, Jan Beulich wrote:
>>>>>>>> On 20.05.2022 12:47, Roger Pau Monné wrote:
>>>>>>>>> On Thu, May 19, 2022 at 02:12:04PM +0200, Jan Beulich wrote:
>>>>>>>>>> On 06.05.2022 13:16, Roger Pau Monné wrote:
>>>>>>>>>>> On Mon, Apr 25, 2022 at 10:40:55AM +0200, Jan Beulich wrote:
>>>>>>>>>>>> --- a/xen/drivers/passthrough/amd/iommu_map.c
>>>>>>>>>>>> +++ b/xen/drivers/passthrough/amd/iommu_map.c
>>>>>>>>>>>> @@ -115,7 +115,19 @@ static void set_iommu_ptes_present(unsig
>>>>>>>>>>>>  
>>>>>>>>>>>>      while ( nr_ptes-- )
>>>>>>>>>>>>      {
>>>>>>>>>>>> -        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
>>>>>>>>>>>> +        ASSERT(!pde->next_level);
>>>>>>>>>>>> +        ASSERT(!pde->u);
>>>>>>>>>>>> +
>>>>>>>>>>>> +        if ( pde > table )
>>>>>>>>>>>> +            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
>>>>>>>>>>>> +        else
>>>>>>>>>>>> +            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
>>>>>>>>>>>
>>>>>>>>>>> I think PAGETABLE_ORDER would be clearer here.
>>>>>>>>>>
>>>>>>>>>> I disagree - PAGETABLE_ORDER is a CPU-side concept. It's not used anywhere
>>>>>>>>>> in IOMMU code afaics.
>>>>>>>>>
>>>>>>>>> Isn't PAGE_SHIFT also a CPU-side concept in the same way?  I'm not
>>>>>>>>> sure what's the rule for declaring that PAGE_SHIFT is fine to use in
>>>>>>>>> IOMMU code  but not PAGETABLE_ORDER.
>>>>>>>>
>>>>>>>> Hmm, yes and no. But for consistency with other IOMMU code I may want
>>>>>>>> to switch to PAGE_SHIFT_4K.
>>>>>>>
>>>>>>> Except that, with the plan to re-use pt_update_contig_markers() for CPU-
>>>>>>> side re-coalescing, there I'd prefer to stick to PAGE_SHIFT.
>>>>>>
>>>>>> Then can PAGETABLE_ORDER be used instead of PAGE_SHIFT - 3?
>>>>>
>>>>> pt_update_contig_markers() isn't IOMMU code; since I've said I'd switch
>>>>> to PAGE_SHIFT_4K in IOMMU code I'm having a hard time seeing how I could
>>>>> at the same time start using PAGETABLE_ORDER there.
>>>>
>>>> I've got confused by the double reply and read it as if you were
>>>> going to stick to using PAGE_SHIFT everywhere as proposed originally.
>>>>
>>>>> What I maybe could do is use PTE_PER_TABLE_SHIFT in AMD code and
>>>>> LEVEL_STRIDE in VT-d one. Yet I'm not sure that would be fully correct/
>>>>> consistent, ...
>>>>>
>>>>>> IMO it makes the code quite easier to understand.
>>>>>
>>>>> ... or in fact helping readability.
>>>>
>>>> Looking at pt_update_contig_markers() we hardcode CONTIG_LEVEL_SHIFT
>>>> to 9 there, which means all users must have a page table order of 9.
>>>>
>>>> It seems to me we are just making things more complicated than
>>>> necessary by trying to avoid dependencies between CPU and IOMMU
>>>> page-table sizes and definitions, when the underlying mechanism to set
>>>> ->ign0 has those assumptions baked in.
>>>>
>>>> Would it help if you introduced a PAGE_TABLE_ORDER in page-defs.h?
>>>
>>> Sorry, should be PAGE_TABLE_ORDER_4K.
>>
>> Oh, good that I looked here before replying to the earlier mail: I'm
>> afraid I view PAGE_TABLE_ORDER_4K as not very useful. From an
>> abstract POV, what is the base unit meant to be that the order
>> is based upon? PAGE_SHIFT? Or PAGE_SHIFT_4K? I think such an
>> ambiguity is going to remain even if we very clearly spelled out what
>> we mean things to be, as one would always need to go back to that
>> comment to check which of the two possible ways it is.
>>
>> Furthermore I'm not convinced PAGETABLE_ORDER is really meant to be
>> associated with a particular page size anyway: PAGE_TABLE_ORDER_2M
>> imo makes no sense at all. And page-defs.h is not supposed to
>> express any platform properties anyway, it's merely an accumulation
>> of (believed) useful constants.
>>
>> Hence the only thing which I might see as a (remote) option is
>> IOMMU_PAGE_TABLE_ORDER (for platforms where all IOMMU variants have
>> all page table levels using identical sizes, which isn't a given, but
>> which would hold for x86 and hence for the purpose here).
> 
> Since you already define a page table order in pt-contig-markers.h
> (CONTIG_NR) it might be possible to export and use that?  In fact the
> check done here would be even more accurate if it was done using the
> same constant that's used in pt_update_contig_markers(), because the
> purpose here is to check that the vendor specific code to init the
> page tables has used the correct value.

Hmm, yes, let me do that. It'll be a little odd in the header itself
(as I'll need to exclude the bulk of it when CONTIG_MASK is not
defined), but apart from that it should indeed end up being better.

Jan

Patch

--- a/xen/arch/x86/include/asm/iommu.h
+++ b/xen/arch/x86/include/asm/iommu.h
@@ -146,7 +146,8 @@  void iommu_free_domid(domid_t domid, uns
 
 int __must_check iommu_free_pgtables(struct domain *d);
 struct domain_iommu;
-struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
+struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd,
+                                                   uint64_t contig_mask);
 void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg);
 
 #endif /* !__ARCH_X86_IOMMU_H__ */
--- a/xen/drivers/passthrough/amd/iommu-defs.h
+++ b/xen/drivers/passthrough/amd/iommu-defs.h
@@ -446,11 +446,13 @@  union amd_iommu_x2apic_control {
 #define IOMMU_PAGE_TABLE_U32_PER_ENTRY	(IOMMU_PAGE_TABLE_ENTRY_SIZE / 4)
 #define IOMMU_PAGE_TABLE_ALIGNMENT	4096
 
+#define IOMMU_PTE_CONTIG_MASK           0x1e /* The ign0 field below. */
+
 union amd_iommu_pte {
     uint64_t raw;
     struct {
         bool pr:1;
-        unsigned int ign0:4;
+        unsigned int ign0:4; /* Covered by IOMMU_PTE_CONTIG_MASK. */
         bool a:1;
         bool d:1;
         unsigned int ign1:2;
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -115,7 +115,19 @@  static void set_iommu_ptes_present(unsig
 
     while ( nr_ptes-- )
     {
-        set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
+        ASSERT(!pde->next_level);
+        ASSERT(!pde->u);
+
+        if ( pde > table )
+            ASSERT(pde->ign0 == find_first_set_bit(pde - table));
+        else
+            ASSERT(pde->ign0 == PAGE_SHIFT - 3);
+
+        pde->iw = iw;
+        pde->ir = ir;
+        pde->fc = true; /* See set_iommu_pde_present(). */
+        pde->mfn = next_mfn;
+        pde->pr = true;
 
         ++pde;
         next_mfn += page_sz;
@@ -295,7 +307,7 @@  static int iommu_pde_from_dfn(struct dom
             mfn = next_table_mfn;
 
             /* allocate lower level page table */
-            table = iommu_alloc_pgtable(hd);
+            table = iommu_alloc_pgtable(hd, IOMMU_PTE_CONTIG_MASK);
             if ( table == NULL )
             {
                 AMD_IOMMU_ERROR("cannot allocate I/O page table\n");
@@ -325,7 +337,7 @@  static int iommu_pde_from_dfn(struct dom
 
             if ( next_table_mfn == 0 )
             {
-                table = iommu_alloc_pgtable(hd);
+                table = iommu_alloc_pgtable(hd, IOMMU_PTE_CONTIG_MASK);
                 if ( table == NULL )
                 {
                     AMD_IOMMU_ERROR("cannot allocate I/O page table\n");
@@ -717,7 +729,7 @@  static int fill_qpt(union amd_iommu_pte
                  * page table pages, and the resulting allocations are always
                  * zeroed.
                  */
-                pgs[level] = iommu_alloc_pgtable(hd);
+                pgs[level] = iommu_alloc_pgtable(hd, 0);
                 if ( !pgs[level] )
                 {
                     rc = -ENOMEM;
@@ -775,7 +787,7 @@  int cf_check amd_iommu_quarantine_init(s
         return 0;
     }
 
-    pdev->arch.amd.root_table = iommu_alloc_pgtable(hd);
+    pdev->arch.amd.root_table = iommu_alloc_pgtable(hd, 0);
     if ( !pdev->arch.amd.root_table )
         return -ENOMEM;
 
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -342,7 +342,7 @@  int amd_iommu_alloc_root(struct domain *
 
     if ( unlikely(!hd->arch.amd.root_table) && d != dom_io )
     {
-        hd->arch.amd.root_table = iommu_alloc_pgtable(hd);
+        hd->arch.amd.root_table = iommu_alloc_pgtable(hd, 0);
         if ( !hd->arch.amd.root_table )
             return -ENOMEM;
     }
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -334,7 +334,7 @@  static uint64_t addr_to_dma_page_maddr(s
             goto out;
 
         pte_maddr = level;
-        if ( !(pg = iommu_alloc_pgtable(hd)) )
+        if ( !(pg = iommu_alloc_pgtable(hd, 0)) )
             goto out;
 
         hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
@@ -376,7 +376,7 @@  static uint64_t addr_to_dma_page_maddr(s
             }
 
             pte_maddr = level - 1;
-            pg = iommu_alloc_pgtable(hd);
+            pg = iommu_alloc_pgtable(hd, DMA_PTE_CONTIG_MASK);
             if ( !pg )
                 break;
 
@@ -388,12 +388,13 @@  static uint64_t addr_to_dma_page_maddr(s
                 struct dma_pte *split = map_vtd_domain_page(pte_maddr);
                 unsigned long inc = 1UL << level_to_offset_bits(level - 1);
 
-                split[0].val = pte->val;
+                split[0].val |= pte->val & ~DMA_PTE_CONTIG_MASK;
                 if ( inc == PAGE_SIZE )
                     split[0].val &= ~DMA_PTE_SP;
 
                 for ( offset = 1; offset < PTE_NUM; ++offset )
-                    split[offset].val = split[offset - 1].val + inc;
+                    split[offset].val |=
+                        (split[offset - 1].val & ~DMA_PTE_CONTIG_MASK) + inc;
 
                 iommu_sync_cache(split, PAGE_SIZE);
                 unmap_vtd_domain_page(split);
@@ -2173,7 +2174,7 @@  static int __must_check cf_check intel_i
     if ( iommu_snoop )
         dma_set_pte_snp(new);
 
-    if ( old.val == new.val )
+    if ( !((old.val ^ new.val) & ~DMA_PTE_CONTIG_MASK) )
     {
         spin_unlock(&hd->arch.mapping_lock);
         unmap_vtd_domain_page(page);
@@ -3052,7 +3053,7 @@  static int fill_qpt(struct dma_pte *this
                  * page table pages, and the resulting allocations are always
                  * zeroed.
                  */
-                pgs[level] = iommu_alloc_pgtable(hd);
+                pgs[level] = iommu_alloc_pgtable(hd, 0);
                 if ( !pgs[level] )
                 {
                     rc = -ENOMEM;
@@ -3109,7 +3110,7 @@  static int cf_check intel_iommu_quaranti
     if ( !drhd )
         return -ENODEV;
 
-    pg = iommu_alloc_pgtable(hd);
+    pg = iommu_alloc_pgtable(hd, 0);
     if ( !pg )
         return -ENOMEM;
 
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -253,7 +253,10 @@  struct context_entry {
  * 2-6: reserved
  * 7: super page
  * 8-11: available
- * 12-63: Host physcial address
+ * 12-51: Host physcial address
+ * 52-61: available (52-55 used for DMA_PTE_CONTIG_MASK)
+ * 62: reserved
+ * 63: available
  */
 struct dma_pte {
     u64 val;
@@ -263,6 +266,7 @@  struct dma_pte {
 #define DMA_PTE_PROT (DMA_PTE_READ | DMA_PTE_WRITE)
 #define DMA_PTE_SP   (1 << 7)
 #define DMA_PTE_SNP  (1 << 11)
+#define DMA_PTE_CONTIG_MASK  (0xfull << PADDR_BITS)
 #define dma_clear_pte(p)    do {(p).val = 0;} while(0)
 #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0)
 #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0)
@@ -276,7 +280,7 @@  struct dma_pte {
 #define dma_pte_write(p) (dma_pte_prot(p) & DMA_PTE_WRITE)
 #define dma_pte_addr(p) ((p).val & PADDR_MASK & PAGE_MASK_4K)
 #define dma_set_pte_addr(p, addr) do {\
-            (p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+            (p).val |= ((addr) & PADDR_MASK & PAGE_MASK_4K); } while (0)
 #define dma_pte_present(p) (((p).val & DMA_PTE_PROT) != 0)
 #define dma_pte_superpage(p) (((p).val & DMA_PTE_SP) != 0)
 
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -522,11 +522,12 @@  int iommu_free_pgtables(struct domain *d
     return 0;
 }
 
-struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd)
+struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd,
+                                      uint64_t contig_mask)
 {
     unsigned int memflags = 0;
     struct page_info *pg;
-    void *p;
+    uint64_t *p;
 
 #ifdef CONFIG_NUMA
     if ( hd->node != NUMA_NO_NODE )
@@ -538,7 +539,29 @@  struct page_info *iommu_alloc_pgtable(st
         return NULL;
 
     p = __map_domain_page(pg);
-    clear_page(p);
+
+    if ( contig_mask )
+    {
+        /* See pt-contig-markers.h for a description of the marker scheme. */
+        unsigned int i, shift = find_first_set_bit(contig_mask);
+
+        ASSERT(((PAGE_SHIFT - 3) & (contig_mask >> shift)) == PAGE_SHIFT - 3);
+
+        p[0] = (PAGE_SHIFT - 3ull) << shift;
+        p[1] = 0;
+        p[2] = 1ull << shift;
+        p[3] = 0;
+
+        for ( i = 4; i < PAGE_SIZE / 8; i += 4 )
+        {
+            p[i + 0] = (find_first_set_bit(i) + 0ull) << shift;
+            p[i + 1] = 0;
+            p[i + 2] = 1ull << shift;
+            p[i + 3] = 0;
+        }
+    }
+    else
+        clear_page(p);
 
     iommu_sync_cache(p, PAGE_SIZE);
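
As a sanity check of the unrolled initialisation above, here is a
standalone sketch (hypothetical, not Xen code; shift = 1 as it would be
for AMD's IOMMU_PTE_CONTIG_MASK, with 512 8-byte entries per 4k page)
reproducing the loop and verifying the resulting markers:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t p[512];           /* PAGE_SIZE / 8 */
        unsigned int i, shift = 1; /* lowest set bit of 0x1e (ign0) */

        p[0] = (12 - 3ull) << shift; /* PAGE_SHIFT - 3 == 9 */
        p[1] = 0;
        p[2] = 1ull << shift;
        p[3] = 0;

        for ( i = 4; i < 512; i += 4 )
        {
            p[i + 0] = (uint64_t)__builtin_ctz(i) << shift;
            p[i + 1] = 0;
            p[i + 2] = 1ull << shift;
            p[i + 3] = 0;
        }

        /* Each marker matches the scheme from the commit message. */
        for ( i = 0; i < 512; ++i )
            assert(p[i] >> shift ==
                   (i ? (unsigned int)__builtin_ctz(i) : 9));

        return 0;
    }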