[v2,19/20] xen/privcmd: Add support for Linux 64KB page granularity

Message ID 1436474552-31789-20-git-send-email-julien.grall@citrix.com (mailing list archive)
State New, archived

Commit Message

Julien Grall July 9, 2015, 8:42 p.m. UTC
The hypercall interface (as well as the toolstack) always uses 4KB
page granularity. When the toolstack asks to map a series of guest
PFNs in a batch, it expects the pages to be mapped contiguously in
its virtual memory.

When Linux is using 64KB page granularity, the privcmd driver has to
map multiple Xen PFNs into a single Linux page.

Note that this solution works for any page granularity which is a
multiple of 4KB.

Signed-off-by: Julien Grall <julien.grall@citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
---
Changes in v2:
    - Use xen_apply_to_page
---
 drivers/xen/privcmd.c   |   8 +--
 drivers/xen/xlate_mmu.c | 127 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 92 insertions(+), 43 deletions(-)

Comments

Boris Ostrovsky July 13, 2015, 8:13 p.m. UTC | #1
On 07/09/2015 04:42 PM, Julien Grall wrote:
> -
>   struct remap_data {
>   	xen_pfn_t *fgmfn; /* foreign domain's gmfn */
> +	xen_pfn_t *efgmfn; /* pointer to the end of the fgmfn array */

It might be better to keep the size of the fgmfn array instead.


>
> +static int unmap_gfn(struct page *page, unsigned long pfn, void *data)
> +{
> +	int *nr = data;
> +	struct xen_remove_from_physmap xrp;
> +
> +	/* The Linux Page may not have been fully mapped to Xen */
> +	if (!*nr)
> +		return 0;
> +
> +	xrp.domid = DOMID_SELF;
> +	xrp.gpfn = pfn;
> +	(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
> +
> +	(*nr)--;
> +
> +	return 0;
> +}
> +
>   int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>   			      int nr, struct page **pages)
>   {
>   	int i;
> +	int nr_page = round_up(nr, XEN_PFN_PER_PAGE);
>   
> -	for (i = 0; i < nr; i++) {
> -		struct xen_remove_from_physmap xrp;
> -		unsigned long pfn;
> +	for (i = 0; i < nr_page; i++) {
> +		/* unmap_gfn guarantees ret == 0 */
> +		BUG_ON(xen_apply_to_page(pages[i], unmap_gfn, &nr));


TBH, I am not sure how useful xen_apply_to_page() routine is. In this 
patch especially, but also in others.

-boris
Julien Grall July 13, 2015, 10:05 p.m. UTC | #2
Hi Boris,

On 13/07/2015 22:13, Boris Ostrovsky wrote:
> On 07/09/2015 04:42 PM, Julien Grall wrote:
>> -
>>   struct remap_data {
>>       xen_pfn_t *fgmfn; /* foreign domain's gmfn */
>> +    xen_pfn_t *efgmfn; /* pointer to the end of the fgmfn array */
>
> It might be better to keep size of fgmfn array instead.

It would mean having another variable to check that we are at the end
of the array.

What about a variable which will be decremented?
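
Something like this, perhaps (a rough, untested sketch: a nr_fgmfn
count in struct remap_data instead of the efgmfn end pointer):

static int setup_hparams(struct page *page, unsigned long pfn, void *data)
{
        struct remap_data *info = data;

        /* We may not have enough gmfn left to fill a Linux page */
        if (!info->nr_fgmfn)
                return 0;

        info->h_idxs[info->h_iter] = *info->fgmfn;
        info->h_gpfns[info->h_iter] = pfn;
        info->h_errs[info->h_iter] = 0;
        info->h_iter++;

        info->fgmfn++;
        info->nr_fgmfn--;

        return 0;
}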

>>
>> +static int unmap_gfn(struct page *page, unsigned long pfn, void *data)
>> +{
>> +    int *nr = data;
>> +    struct xen_remove_from_physmap xrp;
>> +
>> +    /* The Linux Page may not have been fully mapped to Xen */
>> +    if (!*nr)
>> +        return 0;
>> +
>> +    xrp.domid = DOMID_SELF;
>> +    xrp.gpfn = pfn;
>> +    (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
>> +
>> +    (*nr)--;
>> +
>> +    return 0;
>> +}
>> +
>>   int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>>                     int nr, struct page **pages)
>>   {
>>       int i;
>> +    int nr_page = round_up(nr, XEN_PFN_PER_PAGE);
>> -    for (i = 0; i < nr; i++) {
>> -        struct xen_remove_from_physmap xrp;
>> -        unsigned long pfn;
>> +    for (i = 0; i < nr_page; i++) {
>> +        /* unmap_gfn guarantees ret == 0 */
>> +        BUG_ON(xen_apply_to_page(pages[i], unmap_gfn, &nr));
>
>
> TBH, I am not sure how useful xen_apply_to_page() routine is. In this
> patch especially, but also in others.

It avoids an open-coded loop in each place where it's needed (here,
the balloon driver...), which would mean another level of indentation.
You can have a look; it's quite ugly.

Furthermore, the helper will avoid possible mistakes by developers who
are working on PV drivers on x86. If you see code where the MFN
translation is done directly via virt_to_mfn or page_to_mfn... it
likely means that the code will be broken when 64KB page granularity
is in use.
That said, there will still be some places where it's valid to use
virt_to_mfn and page_to_mfn.
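
For reference, the helper (added by an earlier patch in this series)
is essentially an inline loop over the 4KB sub-frames of a Linux page,
along these lines (a sketch; exact names may differ slightly):

typedef int (*xen_pfn_fn_t)(struct page *page, unsigned long pfn, void *data);

/* Call fn once per Xen (4KB) PFN contained in the Linux page */
static inline int xen_apply_to_page(struct page *page, xen_pfn_fn_t fn,
                                    void *data)
{
        unsigned long pfn = page_to_pfn(page) * XEN_PFN_PER_PAGE;
        int i, ret;

        for (i = 0; i < XEN_PFN_PER_PAGE; i++) {
                ret = fn(page, pfn + i, data);
                if (ret)
                        return ret;
        }

        return 0;
}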

Regards,
Boris Ostrovsky July 14, 2015, 3:28 p.m. UTC | #3
On 07/13/2015 06:05 PM, Julien Grall wrote:
> Hi Boris,
>
> On 13/07/2015 22:13, Boris Ostrovsky wrote:
>> On 07/09/2015 04:42 PM, Julien Grall wrote:
>>> -
>>>   struct remap_data {
>>>       xen_pfn_t *fgmfn; /* foreign domain's gmfn */
>>> +    xen_pfn_t *efgmfn; /* pointer to the end of the fgmfn array */
>>
>> It might be better to keep size of fgmfn array instead.
>
> It would means to have an other variable to check that we are at the 
> end the array.


I thought that's what h_iter is. Is it not?


>
> What about a variable which will be decremented?
>
>>>
>>> +static int unmap_gfn(struct page *page, unsigned long pfn, void *data)
>>> +{
>>> +    int *nr = data;
>>> +    struct xen_remove_from_physmap xrp;
>>> +
>>> +    /* The Linux Page may not have been fully mapped to Xen */
>>> +    if (!*nr)
>>> +        return 0;
>>> +
>>> +    xrp.domid = DOMID_SELF;
>>> +    xrp.gpfn = pfn;
>>> +    (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
>>> +
>>> +    (*nr)--;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>>   int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>>>                     int nr, struct page **pages)
>>>   {
>>>       int i;
>>> +    int nr_page = round_up(nr, XEN_PFN_PER_PAGE);
>>> -    for (i = 0; i < nr; i++) {
>>> -        struct xen_remove_from_physmap xrp;
>>> -        unsigned long pfn;
>>> +    for (i = 0; i < nr_page; i++) {
>>> +        /* unmap_gfn guarantees ret == 0 */
>>> +        BUG_ON(xen_apply_to_page(pages[i], unmap_gfn, &nr));
>>
>>
>> TBH, I am not sure how useful xen_apply_to_page() routine is. In this
>> patch especially, but also in others.
>
> It avoids an open loop in each place where it's needed (here, 
> balloon...) which means another indentation layer. You can give a look 
> it's quite ugly.

I didn't notice that it was an inline, in which case it is indeed cleaner.

-boris

>
> Furthermore, the helper will avoid possible done by developers who are 
> working on PV drivers on x86. If you see code where the MFN 
> translation is done directly via virt_to_mfn or page_to_mfn... it will 
> likely means that the code will be broking when 64KB page granularity 
> will be in used.
> Though, there will still be some place where it's valid to use 
> virt_to_mfn and page_to_mfn.
>
> Regards,
>
Julien Grall July 14, 2015, 3:37 p.m. UTC | #4
Hi Boris,

On 14/07/2015 17:28, Boris Ostrovsky wrote:
> On 07/13/2015 06:05 PM, Julien Grall wrote:
>> On 13/07/2015 22:13, Boris Ostrovsky wrote:
>>> On 07/09/2015 04:42 PM, Julien Grall wrote:
>>>> -
>>>>   struct remap_data {
>>>>       xen_pfn_t *fgmfn; /* foreign domain's gmfn */
>>>> +    xen_pfn_t *efgmfn; /* pointer to the end of the fgmfn array */
>>>
>>> It might be better to keep size of fgmfn array instead.
>>
>> It would means to have an other variable to check that we are at the
>> end the array.
>
>
> I thought that's what h_iter is. Is it not?

h_iter is the number of Xen PFNs in a Linux page. This is because the
Linux privcmd interface works with 4KB pages and there may not be
enough of them to fill a 64KB page.

So we need another counter for the total number of foreign domain gmfns.

Regards,
Stefano Stabellini July 16, 2015, 5:12 p.m. UTC | #5
On Thu, 9 Jul 2015, Julien Grall wrote:
> The hypercall interface (as well as the toolstack) is always using 4KB
> page granularity. When the toolstack is asking for mapping a series of
> guest PFN in a batch, it expects to have the page map contiguously in
> its virtual memory.
> 
> When Linux is using 64KB page granularity, the privcmd driver will have
> to map multiple Xen PFN in a single Linux page.
> 
> Note that this solution works on page granularity which is a multiple of
> 4KB.
> 
> Signed-off-by: Julien Grall <julien.grall@citrix.com>
> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: David Vrabel <david.vrabel@citrix.com>
> ---
> Changes in v2:
>     - Use xen_apply_to_page
> ---
>  drivers/xen/privcmd.c   |   8 +--
>  drivers/xen/xlate_mmu.c | 127 +++++++++++++++++++++++++++++++++---------------
>  2 files changed, 92 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index 5a29616..e8714b4 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -446,7 +446,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>  		return -EINVAL;
>  	}
>  
> -	nr_pages = m.num;
> +	nr_pages = DIV_ROUND_UP_ULL(m.num, PAGE_SIZE / XEN_PAGE_SIZE);
>  	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
>  		return -EINVAL;

DIV_ROUND_UP is enough; neither argument is an unsigned long long.
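
That is, presumably just:

        nr_pages = DIV_ROUND_UP(m.num, PAGE_SIZE / XEN_PAGE_SIZE);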


> @@ -494,7 +494,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>  			goto out_unlock;
>  		}
>  		if (xen_feature(XENFEAT_auto_translated_physmap)) {
> -			ret = alloc_empty_pages(vma, m.num);
> +			ret = alloc_empty_pages(vma, nr_pages);
>  			if (ret < 0)
>  				goto out_unlock;
>  		} else
> @@ -518,6 +518,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>  	state.global_error  = 0;
>  	state.version       = version;
>  
> +	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
>  	/* mmap_batch_fn guarantees ret == 0 */
>  	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
>  				    &pagelist, mmap_batch_fn, &state));
> @@ -582,12 +583,13 @@ static void privcmd_close(struct vm_area_struct *vma)
>  {
>  	struct page **pages = vma->vm_private_data;
>  	int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> +	int nr_pfn = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
>  	int rc;
>  
>  	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
>  		return;
>  
> -	rc = xen_unmap_domain_mfn_range(vma, numpgs, pages);
> +	rc = xen_unmap_domain_mfn_range(vma, nr_pfn, pages);
>  	if (rc == 0)
>  		free_xenballooned_pages(numpgs, pages);

If you intend to pass the number of xen pages as nr argument to
xen_unmap_domain_mfn_range, then I think that the changes to
xen_xlate_unmap_gfn_range below are wrong.


>  	else
> diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
> index 58a5389..1fac17c 100644
> --- a/drivers/xen/xlate_mmu.c
> +++ b/drivers/xen/xlate_mmu.c
> @@ -38,31 +38,9 @@
>  #include <xen/interface/xen.h>
>  #include <xen/interface/memory.h>
>  
> -/* map fgmfn of domid to lpfn in the current domain */
> -static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
> -			    unsigned int domid)
> -{
> -	int rc;
> -	struct xen_add_to_physmap_range xatp = {
> -		.domid = DOMID_SELF,
> -		.foreign_domid = domid,
> -		.size = 1,
> -		.space = XENMAPSPACE_gmfn_foreign,
> -	};
> -	xen_ulong_t idx = fgmfn;
> -	xen_pfn_t gpfn = lpfn;
> -	int err = 0;
> -
> -	set_xen_guest_handle(xatp.idxs, &idx);
> -	set_xen_guest_handle(xatp.gpfns, &gpfn);
> -	set_xen_guest_handle(xatp.errs, &err);
> -
> -	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
> -	return rc < 0 ? rc : err;
> -}
> -
>  struct remap_data {
>  	xen_pfn_t *fgmfn; /* foreign domain's gmfn */
> +	xen_pfn_t *efgmfn; /* pointer to the end of the fgmfn array */
>  	pgprot_t prot;
>  	domid_t  domid;
>  	struct vm_area_struct *vma;
> @@ -71,24 +49,75 @@ struct remap_data {
>  	struct xen_remap_mfn_info *info;
>  	int *err_ptr;
>  	int mapped;
> +
> +	/* Hypercall parameters */
> +	int h_errs[XEN_PFN_PER_PAGE];
> +	xen_ulong_t h_idxs[XEN_PFN_PER_PAGE];
> +	xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE];

I don't think you should be adding these fields to struct remap_data:
struct remap_data is used to pass multi-page arguments from
xen_xlate_remap_gfn_array to remap_pte_fn.

I think you need to introduce a different struct to pass per-Linux-page
arguments from remap_pte_fn to setup_hparams.
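
Something like this, perhaps (an untested sketch; the struct name is
made up), with struct remap_data keeping the fgmfn cursor and
remap_pte_fn filling one of these per Linux page:

/*
 * Per-Linux-page hypercall parameters, filled by setup_hparams and
 * consumed by remap_pte_fn (the fgmfn cursor would stay in struct
 * remap_data or be reached through a back-pointer).
 */
struct remap_hparams {
        xen_ulong_t h_idxs[XEN_PFN_PER_PAGE];
        xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE];
        int h_errs[XEN_PFN_PER_PAGE];
        int h_iter;     /* number of entries filled */
};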


> +	int h_iter;	/* Iterator */
>  };
>  
> +static int setup_hparams(struct page *page, unsigned long pfn, void *data)
> +{
> +	struct remap_data *info = data;
> +
> +	/* We may not have enough domain's gmfn to fill a Linux Page */
> +	if (info->fgmfn == info->efgmfn)
> +		return 0;
> +
> +	info->h_idxs[info->h_iter] = *info->fgmfn;
> +	info->h_gpfns[info->h_iter] = pfn;
> +	info->h_errs[info->h_iter] = 0;
> +	info->h_iter++;
> +
> +	info->fgmfn++;
> +
> +	return 0;
> +}
> +
>  static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
>  			void *data)
>  {
>  	struct remap_data *info = data;
>  	struct page *page = info->pages[info->index++];
> -	unsigned long pfn = page_to_pfn(page);
> -	pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
> +	pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), info->prot));
>  	int rc;
> +	uint32_t i;
> +	struct xen_add_to_physmap_range xatp = {
> +		.domid = DOMID_SELF,
> +		.foreign_domid = info->domid,
> +		.space = XENMAPSPACE_gmfn_foreign,
> +	};
>  
> -	rc = map_foreign_page(pfn, *info->fgmfn, info->domid);
> -	*info->err_ptr++ = rc;
> -	if (!rc) {
> -		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
> -		info->mapped++;
> +	info->h_iter = 0;
> +
> +	/* setup_hparams guarantees ret == 0 */
> +	BUG_ON(xen_apply_to_page(page, setup_hparams, info));
> +
> +	set_xen_guest_handle(xatp.idxs, info->h_idxs);
> +	set_xen_guest_handle(xatp.gpfns, info->h_gpfns);
> +	set_xen_guest_handle(xatp.errs, info->h_errs);
> +	xatp.size = info->h_iter;
> +
> +	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);

I would have thought that XENMEM_add_to_physmap_range operates at 4K
granularity, regardless of how the guest decides to lay out its
pagetables. If so, the call to XENMEM_add_to_physmap_range needs to be
moved within the function passed to xen_apply_to_page.


> +	/* info->err_ptr expect to have one error status per Xen PFN */
> +	for (i = 0; i < info->h_iter; i++) {
> +		int err = (rc < 0) ? rc : info->h_errs[i];
> +
> +		*(info->err_ptr++) = err;
> +		if (!err)
> +			info->mapped++;
>  	}
> -	info->fgmfn++;
> +
> +	/*
> +	 * Note: The hypercall will return 0 in most of the case if even if
                                         ^ in most cases

> +	 * all the fgmfn are not mapped. We still have to update the pte
       ^ not all the fgmfn are mapped.

> +	 * as the userspace may decide to continue.
> +	 */
> +	if (!rc)
> +		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
>  
>  	return 0;
>  }
> @@ -102,13 +131,14 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>  {
>  	int err;
>  	struct remap_data data;
> -	unsigned long range = nr << PAGE_SHIFT;
> +	unsigned long range = round_up(nr, XEN_PFN_PER_PAGE) << XEN_PAGE_SHIFT;

I would just BUG_ON(nr % XEN_PFN_PER_PAGE) and avoid the round_up.



>  	/* Kept here for the purpose of making sure code doesn't break
>  	   x86 PVOPS */
>  	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
>  
>  	data.fgmfn = mfn;
> +	data.efgmfn = mfn + nr;

If we assume that nr is a multiple of XEN_PFN_PER_PAGE, then we can get
rid of efgmfn.


>  	data.prot  = prot;
>  	data.domid = domid;
>  	data.vma   = vma;
> @@ -123,21 +153,38 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>  }
>  EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
>  
> +static int unmap_gfn(struct page *page, unsigned long pfn, void *data)
> +{
> +	int *nr = data;
> +	struct xen_remove_from_physmap xrp;
> +
> +	/* The Linux Page may not have been fully mapped to Xen */
> +	if (!*nr)
> +		return 0;
> +
> +	xrp.domid = DOMID_SELF;
> +	xrp.gpfn = pfn;
> +	(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
> +
> +	(*nr)--;

I don't understand why you are passing nr as a private argument. I
would just call XENMEM_remove_from_physmap unconditionally here. Am I
missing something? After all, XENMEM_remove_from_physmap is just
unmapping at 4K granularity, right?
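
I.e. something like this (a sketch of the unconditional version):

static int unmap_gfn(struct page *page, unsigned long pfn, void *data)
{
        struct xen_remove_from_physmap xrp;

        xrp.domid = DOMID_SELF;
        xrp.gpfn = pfn;
        (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);

        return 0;
}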


> +	return 0;
> +}
> +
>  int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>  			      int nr, struct page **pages)
>  {
>  	int i;
> +	int nr_page = round_up(nr, XEN_PFN_PER_PAGE);

If nr is the number of xen pages, then this should be:

    int nr_pages = DIV_ROUND_UP(nr, XEN_PFN_PER_PAGE);



> -	for (i = 0; i < nr; i++) {
> -		struct xen_remove_from_physmap xrp;
> -		unsigned long pfn;
> +	for (i = 0; i < nr_page; i++) {
> +		/* unmap_gfn guarantees ret == 0 */
> +		BUG_ON(xen_apply_to_page(pages[i], unmap_gfn, &nr));
> +	}
>  
> -		pfn = page_to_pfn(pages[i]);
> +	/* We should have consume every xen page */
                        ^ consumed


> +	BUG_ON(nr != 0);
>  
> -		xrp.domid = DOMID_SELF;
> -		xrp.gpfn = pfn;
> -		(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
> -	}
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range);
> -- 
> 2.1.4
>
Stefano Stabellini July 16, 2015, 5:16 p.m. UTC | #6
On Thu, 16 Jul 2015, Stefano Stabellini wrote:
> > +	/* setup_hparams guarantees ret == 0 */
> > +	BUG_ON(xen_apply_to_page(page, setup_hparams, info));
> > +
> > +	set_xen_guest_handle(xatp.idxs, info->h_idxs);
> > +	set_xen_guest_handle(xatp.gpfns, info->h_gpfns);
> > +	set_xen_guest_handle(xatp.errs, info->h_errs);
> > +	xatp.size = info->h_iter;
> > +
> > +	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
> 
> I would have thought that XENMEM_add_to_physmap_range operates at 4K
> granularity, regardless of how the guest decides to layout its
> pagetables. If so, the call to XENMEM_add_to_physmap_range needs to be
> moved within the function passed to xen_apply_to_page.

Sorry, this comment is wrong, please ignore it. The others are OK.
Julien Grall July 17, 2015, 12:50 p.m. UTC | #7
Hi Stefano,

On 16/07/15 18:12, Stefano Stabellini wrote:
> On Thu, 9 Jul 2015, Julien Grall wrote:
>> The hypercall interface (as well as the toolstack) is always using 4KB
>> page granularity. When the toolstack is asking for mapping a series of
>> guest PFN in a batch, it expects to have the page map contiguously in
>> its virtual memory.
>>
>> When Linux is using 64KB page granularity, the privcmd driver will have
>> to map multiple Xen PFN in a single Linux page.
>>
>> Note that this solution works on page granularity which is a multiple of
>> 4KB.
>>
>> Signed-off-by: Julien Grall <julien.grall@citrix.com>
>> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>> Cc: David Vrabel <david.vrabel@citrix.com>
>> ---
>> Changes in v2:
>>     - Use xen_apply_to_page
>> ---
>>  drivers/xen/privcmd.c   |   8 +--
>>  drivers/xen/xlate_mmu.c | 127 +++++++++++++++++++++++++++++++++---------------
>>  2 files changed, 92 insertions(+), 43 deletions(-)
>>
>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
>> index 5a29616..e8714b4 100644
>> --- a/drivers/xen/privcmd.c
>> +++ b/drivers/xen/privcmd.c
>> @@ -446,7 +446,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>>  		return -EINVAL;
>>  	}
>>  
>> -	nr_pages = m.num;
>> +	nr_pages = DIV_ROUND_UP_ULL(m.num, PAGE_SIZE / XEN_PAGE_SIZE);
>>  	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
>>  		return -EINVAL;
> 
> DIV_ROUND_UP is enough, neither arguments are unsigned long long

I'm not sure why I used DIV_ROUND_UP_ULL here... I will switch to
DIV_ROUND_UP in the next version.

> 
>> @@ -494,7 +494,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>>  			goto out_unlock;
>>  		}
>>  		if (xen_feature(XENFEAT_auto_translated_physmap)) {
>> -			ret = alloc_empty_pages(vma, m.num);
>> +			ret = alloc_empty_pages(vma, nr_pages);
>>  			if (ret < 0)
>>  				goto out_unlock;
>>  		} else
>> @@ -518,6 +518,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>>  	state.global_error  = 0;
>>  	state.version       = version;
>>  
>> +	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
>>  	/* mmap_batch_fn guarantees ret == 0 */
>>  	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
>>  				    &pagelist, mmap_batch_fn, &state));
>> @@ -582,12 +583,13 @@ static void privcmd_close(struct vm_area_struct *vma)
>>  {
>>  	struct page **pages = vma->vm_private_data;
>>  	int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
>> +	int nr_pfn = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
>>  	int rc;
>>  
>>  	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
>>  		return;
>>  
>> -	rc = xen_unmap_domain_mfn_range(vma, numpgs, pages);
>> +	rc = xen_unmap_domain_mfn_range(vma, nr_pfn, pages);
>>  	if (rc == 0)
>>  		free_xenballooned_pages(numpgs, pages);
> 
> If you intend to pass the number of xen pages as nr argument to
> xen_unmap_domain_mfn_range, then I think that the changes to
> xen_xlate_unmap_gfn_range below are wrong.

Hmmm... right. I will fix it.

> 
> 
>>  	else
>> diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
>> index 58a5389..1fac17c 100644
>> --- a/drivers/xen/xlate_mmu.c
>> +++ b/drivers/xen/xlate_mmu.c
>> @@ -38,31 +38,9 @@
>>  #include <xen/interface/xen.h>
>>  #include <xen/interface/memory.h>
>>  
>> -/* map fgmfn of domid to lpfn in the current domain */
>> -static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
>> -			    unsigned int domid)
>> -{
>> -	int rc;
>> -	struct xen_add_to_physmap_range xatp = {
>> -		.domid = DOMID_SELF,
>> -		.foreign_domid = domid,
>> -		.size = 1,
>> -		.space = XENMAPSPACE_gmfn_foreign,
>> -	};
>> -	xen_ulong_t idx = fgmfn;
>> -	xen_pfn_t gpfn = lpfn;
>> -	int err = 0;
>> -
>> -	set_xen_guest_handle(xatp.idxs, &idx);
>> -	set_xen_guest_handle(xatp.gpfns, &gpfn);
>> -	set_xen_guest_handle(xatp.errs, &err);
>> -
>> -	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
>> -	return rc < 0 ? rc : err;
>> -}
>> -
>>  struct remap_data {
>>  	xen_pfn_t *fgmfn; /* foreign domain's gmfn */
>> +	xen_pfn_t *efgmfn; /* pointer to the end of the fgmfn array */
>>  	pgprot_t prot;
>>  	domid_t  domid;
>>  	struct vm_area_struct *vma;
>> @@ -71,24 +49,75 @@ struct remap_data {
>>  	struct xen_remap_mfn_info *info;
>>  	int *err_ptr;
>>  	int mapped;
>> +
>> +	/* Hypercall parameters */
>> +	int h_errs[XEN_PFN_PER_PAGE];
>> +	xen_ulong_t h_idxs[XEN_PFN_PER_PAGE];
>> +	xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE];
> 
> I don't think you should be adding these fields to struct remap_data:
> struct remap_data is used to pass multi pages arguments from
> xen_xlate_remap_gfn_array to remap_pte_fn.
> 
> I think you need to introduce a different struct to pass per linux page
> arguments from remap_pte_fn to setup_hparams.

I didn't introduce a new structure because I wanted to avoid
allocating one on the stack every time remap_pte_fn is called.

Maybe that optimization isn't worth it?

[...]

>> +	/* info->err_ptr expect to have one error status per Xen PFN */
>> +	for (i = 0; i < info->h_iter; i++) {
>> +		int err = (rc < 0) ? rc : info->h_errs[i];
>> +
>> +		*(info->err_ptr++) = err;
>> +		if (!err)
>> +			info->mapped++;
>>  	}
>> -	info->fgmfn++;
>> +
>> +	/*
>> +	 * Note: The hypercall will return 0 in most of the case if even if
>                                          ^ in most cases

Will fix it.

>> +	 * all the fgmfn are not mapped. We still have to update the pte
>        ^ not all the fgmfn are mapped.
> 
>> +	 * as the userspace may decide to continue.
>> +	 */
>> +	if (!rc)
>> +		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
>>  
>>  	return 0;
>>  }
>> @@ -102,13 +131,14 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>>  {
>>  	int err;
>>  	struct remap_data data;
>> -	unsigned long range = nr << PAGE_SHIFT;
>> +	unsigned long range = round_up(nr, XEN_PFN_PER_PAGE) << XEN_PAGE_SHIFT;
> 
> If would just BUG_ON(nr % XEN_PFN_PER_PAGE) and avoid the round_up;

As discussed IRL, the toolstack can request to map only one Xen page,
so the BUG_ON would be hit.

Anyway, as you suggested IRL, I will replace the round_up with
DIV_ROUND_UP in the next version.

>>  	data.prot  = prot;
>>  	data.domid = domid;
>>  	data.vma   = vma;
>> @@ -123,21 +153,38 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>>  }
>>  EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
>>  
>> +static int unmap_gfn(struct page *page, unsigned long pfn, void *data)
>> +{
>> +	int *nr = data;
>> +	struct xen_remove_from_physmap xrp;
>> +
>> +	/* The Linux Page may not have been fully mapped to Xen */
>> +	if (!*nr)
>> +		return 0;
>> +
>> +	xrp.domid = DOMID_SELF;
>> +	xrp.gpfn = pfn;
>> +	(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
>> +
>> +	(*nr)--;
> 
> I don't understand why you are passing nr as private argument. I would
> just call XENMEM_remove_from_physmap unconditionally here. Am I missing
> something? After all XENMEM_remove_from_physmap is just unmapping
> at 4K granularity, right?

Yes, but the caller may ask to remove only one 4KB page. When 64KB
pages are in use, that would mean calling the hypervisor 16 times for
only one useful remove.

This is because the hypervisor doesn't provide a hypercall to remove a
list of PFNs, which is very unfortunate.

Although, as discussed IIRC, I can look at providing a new function
xen_apply_to_page_range which would handle the counter internally.
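
Roughly something like this, perhaps (a sketch only; the name and
exact signature are not settled, reusing the callback type sketched
earlier in the thread):

/*
 * Call fn on the first nr Xen PFNs backed by the pages array, so the
 * caller neither open-codes the sub-page loop nor issues hypercalls
 * for sub-pages beyond nr.
 */
static inline int xen_apply_to_page_range(struct page **pages,
                                          unsigned long nr,
                                          xen_pfn_fn_t fn, void *data)
{
        unsigned long i;

        for (i = 0; i < nr; i++) {
                struct page *page = pages[i / XEN_PFN_PER_PAGE];
                unsigned long pfn = page_to_pfn(page) * XEN_PFN_PER_PAGE +
                                    (i % XEN_PFN_PER_PAGE);
                int ret = fn(page, pfn, data);

                if (ret)
                        return ret;
        }

        return 0;
}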

> 
> 
>> +	return 0;
>> +}
>> +
>>  int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>>  			      int nr, struct page **pages)
>>  {
>>  	int i;
>> +	int nr_page = round_up(nr, XEN_PFN_PER_PAGE);
> 
> If nr is the number of xen pages, then this should be:
> 
>     int nr_pages = DIV_ROUND_UP(nr, XEN_PFN_PER_PAGE);

Correct, I will fix it.

>> -	for (i = 0; i < nr; i++) {
>> -		struct xen_remove_from_physmap xrp;
>> -		unsigned long pfn;
>> +	for (i = 0; i < nr_page; i++) {
>> +		/* unmap_gfn guarantees ret == 0 */
>> +		BUG_ON(xen_apply_to_page(pages[i], unmap_gfn, &nr));
>> +	}
>>  
>> -		pfn = page_to_pfn(pages[i]);
>> +	/* We should have consume every xen page */
>                         ^ consumed

I will fix it.

Regards,

Patch

diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 5a29616..e8714b4 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -446,7 +446,7 @@  static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 		return -EINVAL;
 	}
 
-	nr_pages = m.num;
+	nr_pages = DIV_ROUND_UP_ULL(m.num, PAGE_SIZE / XEN_PAGE_SIZE);
 	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
 		return -EINVAL;
 
@@ -494,7 +494,7 @@  static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 			goto out_unlock;
 		}
 		if (xen_feature(XENFEAT_auto_translated_physmap)) {
-			ret = alloc_empty_pages(vma, m.num);
+			ret = alloc_empty_pages(vma, nr_pages);
 			if (ret < 0)
 				goto out_unlock;
 		} else
@@ -518,6 +518,7 @@  static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 	state.global_error  = 0;
 	state.version       = version;
 
+	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
 	/* mmap_batch_fn guarantees ret == 0 */
 	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
 				    &pagelist, mmap_batch_fn, &state));
@@ -582,12 +583,13 @@  static void privcmd_close(struct vm_area_struct *vma)
 {
 	struct page **pages = vma->vm_private_data;
 	int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	int nr_pfn = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
 	int rc;
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
 		return;
 
-	rc = xen_unmap_domain_mfn_range(vma, numpgs, pages);
+	rc = xen_unmap_domain_mfn_range(vma, nr_pfn, pages);
 	if (rc == 0)
 		free_xenballooned_pages(numpgs, pages);
 	else
diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
index 58a5389..1fac17c 100644
--- a/drivers/xen/xlate_mmu.c
+++ b/drivers/xen/xlate_mmu.c
@@ -38,31 +38,9 @@ 
 #include <xen/interface/xen.h>
 #include <xen/interface/memory.h>
 
-/* map fgmfn of domid to lpfn in the current domain */
-static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
-			    unsigned int domid)
-{
-	int rc;
-	struct xen_add_to_physmap_range xatp = {
-		.domid = DOMID_SELF,
-		.foreign_domid = domid,
-		.size = 1,
-		.space = XENMAPSPACE_gmfn_foreign,
-	};
-	xen_ulong_t idx = fgmfn;
-	xen_pfn_t gpfn = lpfn;
-	int err = 0;
-
-	set_xen_guest_handle(xatp.idxs, &idx);
-	set_xen_guest_handle(xatp.gpfns, &gpfn);
-	set_xen_guest_handle(xatp.errs, &err);
-
-	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
-	return rc < 0 ? rc : err;
-}
-
 struct remap_data {
 	xen_pfn_t *fgmfn; /* foreign domain's gmfn */
+	xen_pfn_t *efgmfn; /* pointer to the end of the fgmfn array */
 	pgprot_t prot;
 	domid_t  domid;
 	struct vm_area_struct *vma;
@@ -71,24 +49,75 @@  struct remap_data {
 	struct xen_remap_mfn_info *info;
 	int *err_ptr;
 	int mapped;
+
+	/* Hypercall parameters */
+	int h_errs[XEN_PFN_PER_PAGE];
+	xen_ulong_t h_idxs[XEN_PFN_PER_PAGE];
+	xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE];
+
+	int h_iter;	/* Iterator */
 };
 
+static int setup_hparams(struct page *page, unsigned long pfn, void *data)
+{
+	struct remap_data *info = data;
+
+	/* We may not have enough domain's gmfn to fill a Linux Page */
+	if (info->fgmfn == info->efgmfn)
+		return 0;
+
+	info->h_idxs[info->h_iter] = *info->fgmfn;
+	info->h_gpfns[info->h_iter] = pfn;
+	info->h_errs[info->h_iter] = 0;
+	info->h_iter++;
+
+	info->fgmfn++;
+
+	return 0;
+}
+
 static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
 			void *data)
 {
 	struct remap_data *info = data;
 	struct page *page = info->pages[info->index++];
-	unsigned long pfn = page_to_pfn(page);
-	pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
+	pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), info->prot));
 	int rc;
+	uint32_t i;
+	struct xen_add_to_physmap_range xatp = {
+		.domid = DOMID_SELF,
+		.foreign_domid = info->domid,
+		.space = XENMAPSPACE_gmfn_foreign,
+	};
 
-	rc = map_foreign_page(pfn, *info->fgmfn, info->domid);
-	*info->err_ptr++ = rc;
-	if (!rc) {
-		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
-		info->mapped++;
+	info->h_iter = 0;
+
+	/* setup_hparams guarantees ret == 0 */
+	BUG_ON(xen_apply_to_page(page, setup_hparams, info));
+
+	set_xen_guest_handle(xatp.idxs, info->h_idxs);
+	set_xen_guest_handle(xatp.gpfns, info->h_gpfns);
+	set_xen_guest_handle(xatp.errs, info->h_errs);
+	xatp.size = info->h_iter;
+
+	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+
+	/* info->err_ptr expect to have one error status per Xen PFN */
+	for (i = 0; i < info->h_iter; i++) {
+		int err = (rc < 0) ? rc : info->h_errs[i];
+
+		*(info->err_ptr++) = err;
+		if (!err)
+			info->mapped++;
 	}
-	info->fgmfn++;
+
+	/*
+	 * Note: The hypercall will return 0 in most of the case if even if
+	 * all the fgmfn are not mapped. We still have to update the pte
+	 * as the userspace may decide to continue.
+	 */
+	if (!rc)
+		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
 
 	return 0;
 }
@@ -102,13 +131,14 @@  int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 {
 	int err;
 	struct remap_data data;
-	unsigned long range = nr << PAGE_SHIFT;
+	unsigned long range = round_up(nr, XEN_PFN_PER_PAGE) << XEN_PAGE_SHIFT;
 
 	/* Kept here for the purpose of making sure code doesn't break
 	   x86 PVOPS */
 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
 	data.fgmfn = mfn;
+	data.efgmfn = mfn + nr;
 	data.prot  = prot;
 	data.domid = domid;
 	data.vma   = vma;
@@ -123,21 +153,38 @@  int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
 
+static int unmap_gfn(struct page *page, unsigned long pfn, void *data)
+{
+	int *nr = data;
+	struct xen_remove_from_physmap xrp;
+
+	/* The Linux Page may not have been fully mapped to Xen */
+	if (!*nr)
+		return 0;
+
+	xrp.domid = DOMID_SELF;
+	xrp.gpfn = pfn;
+	(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+
+	(*nr)--;
+
+	return 0;
+}
+
 int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
 			      int nr, struct page **pages)
 {
 	int i;
+	int nr_page = round_up(nr, XEN_PFN_PER_PAGE);
 
-	for (i = 0; i < nr; i++) {
-		struct xen_remove_from_physmap xrp;
-		unsigned long pfn;
+	for (i = 0; i < nr_page; i++) {
+		/* unmap_gfn guarantees ret == 0 */
+		BUG_ON(xen_apply_to_page(pages[i], unmap_gfn, &nr));
+	}
 
-		pfn = page_to_pfn(pages[i]);
+	/* We should have consume every xen page */
+	BUG_ON(nr != 0);
 
-		xrp.domid = DOMID_SELF;
-		xrp.gpfn = pfn;
-		(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
-	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range);