diff mbox series

[v2,RFC] KVM: s390/interrupt: do not pin adapter interrupt pages

Message ID 20200211092341.3965-1-borntraeger@de.ibm.com
State New, archived
Headers show
Series [v2,RFC] KVM: s390/interrupt: do not pin adapter interrupt pages | expand

Commit Message

Christian Borntraeger Feb. 11, 2020, 9:23 a.m. UTC
From: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>

The adapter interrupt page containing the indicator bits is currently
pinned. That means that a guest with many devices can pin a lot of
memory pages in the host. This also complicates the reference tracking
which is needed for memory management handling of protected virtual
machines.
We can simply try to get the userspace page set the bits and free the
page. By storing the userspace address in the irq routing entry instead
of the guest address we can actually avoid many lookups and list walks
so that this variant is very likely not slower.

Signed-off-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
[borntraeger@de.ibm.com: patch simplification]
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
quick and dirty, how this could look like


 arch/s390/include/asm/kvm_host.h |   3 -
 arch/s390/kvm/interrupt.c        | 146 +++++++++++--------------------
 2 files changed, 49 insertions(+), 100 deletions(-)

Comments

Christian Borntraeger Feb. 12, 2020, 11:52 a.m. UTC | #1
I pushed that variant to my next branch. this should trigger several regression runs
in regard to function and performance for normal KVM guests.
Lets see if this has any impact at all. If not this could be the simplest solution that
also simplifies a lot of code.

On 11.02.20 10:23, Christian Borntraeger wrote:
> From: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> 
> The adapter interrupt page containing the indicator bits is currently
> pinned. That means that a guest with many devices can pin a lot of
> memory pages in the host. This also complicates the reference tracking
> which is needed for memory management handling of protected virtual
> machines.
> We can simply try to get the userspace page set the bits and free the
> page. By storing the userspace address in the irq routing entry instead
> of the guest address we can actually avoid many lookups and list walks
> so that this variant is very likely not slower.
> 
> Signed-off-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> [borntraeger@de.ibm.com: patch simplification]
> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
> quick and dirty, how this could look like
> 
> 
>  arch/s390/include/asm/kvm_host.h |   3 -
>  arch/s390/kvm/interrupt.c        | 146 +++++++++++--------------------
>  2 files changed, 49 insertions(+), 100 deletions(-)
> 
> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
> index 0d398738ded9..88a218872fa0 100644
> --- a/arch/s390/include/asm/kvm_host.h
> +++ b/arch/s390/include/asm/kvm_host.h
> @@ -771,9 +771,6 @@ struct s390_io_adapter {
>  	bool masked;
>  	bool swap;
>  	bool suppressible;
> -	struct rw_semaphore maps_lock;
> -	struct list_head maps;
> -	atomic_t nr_maps;
>  };
>  
>  #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
> diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
> index d4d35ec79e12..e6fe8b61ee9b 100644
> --- a/arch/s390/kvm/interrupt.c
> +++ b/arch/s390/kvm/interrupt.c
> @@ -2459,9 +2459,6 @@ static int register_io_adapter(struct kvm_device *dev,
>  	if (!adapter)
>  		return -ENOMEM;
>  
> -	INIT_LIST_HEAD(&adapter->maps);
> -	init_rwsem(&adapter->maps_lock);
> -	atomic_set(&adapter->nr_maps, 0);
>  	adapter->id = adapter_info.id;
>  	adapter->isc = adapter_info.isc;
>  	adapter->maskable = adapter_info.maskable;
> @@ -2488,83 +2485,26 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
>  
>  static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
>  {
> -	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
> -	struct s390_map_info *map;
> -	int ret;
> -
> -	if (!adapter || !addr)
> -		return -EINVAL;
> -
> -	map = kzalloc(sizeof(*map), GFP_KERNEL);
> -	if (!map) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> -	INIT_LIST_HEAD(&map->list);
> -	map->guest_addr = addr;
> -	map->addr = gmap_translate(kvm->arch.gmap, addr);
> -	if (map->addr == -EFAULT) {
> -		ret = -EFAULT;
> -		goto out;
> -	}
> -	ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
> -	if (ret < 0)
> -		goto out;
> -	BUG_ON(ret != 1);
> -	down_write(&adapter->maps_lock);
> -	if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
> -		list_add_tail(&map->list, &adapter->maps);
> -		ret = 0;
> -	} else {
> -		put_page(map->page);
> -		ret = -EINVAL;
> +	/*
> +	 * We resolve the gpa to hva when setting the IRQ routing. If userspace
> +	 * decides to mess with the memslots it better also updates the irq
> +	 * routing. Otherwise we will write to the wrong userspace address.
> +	 */
> +	return 0;
>  	}
> -	up_write(&adapter->maps_lock);
> -out:
> -	if (ret)
> -		kfree(map);
> -	return ret;
> -}
>  
>  static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
>  {
> -	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
> -	struct s390_map_info *map, *tmp;
> -	int found = 0;
> -
> -	if (!adapter || !addr)
> -		return -EINVAL;
> -
> -	down_write(&adapter->maps_lock);
> -	list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
> -		if (map->guest_addr == addr) {
> -			found = 1;
> -			atomic_dec(&adapter->nr_maps);
> -			list_del(&map->list);
> -			put_page(map->page);
> -			kfree(map);
> -			break;
> -		}
> -	}
> -	up_write(&adapter->maps_lock);
> -
> -	return found ? 0 : -EINVAL;
> +	return 0;
>  }
>  
>  void kvm_s390_destroy_adapters(struct kvm *kvm)
>  {
>  	int i;
> -	struct s390_map_info *map, *tmp;
>  
>  	for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
>  		if (!kvm->arch.adapters[i])
>  			continue;
> -		list_for_each_entry_safe(map, tmp,
> -					 &kvm->arch.adapters[i]->maps, list) {
> -			list_del(&map->list);
> -			put_page(map->page);
> -			kfree(map);
> -		}
>  		kfree(kvm->arch.adapters[i]);
>  	}
>  }
> @@ -2831,19 +2771,25 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
>  	return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
>  }
>  
> -static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
> -					  u64 addr)
> +static struct page *get_map_page(struct kvm *kvm,
> +				 struct s390_io_adapter *adapter,
> +				 u64 uaddr)
>  {
> -	struct s390_map_info *map;
> +	struct page *page;
> +	int ret;
>  
>  	if (!adapter)
>  		return NULL;
> -
> -	list_for_each_entry(map, &adapter->maps, list) {
> -		if (map->guest_addr == addr)
> -			return map;
> -	}
> -	return NULL;
> +	page = NULL;
> +	if (!uaddr)
> +		return NULL;
> +	down_read(&kvm->mm->mmap_sem);
> +	ret = get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE,
> +				    &page, NULL, NULL);
> +	if (ret < 1)
> +		page = NULL;
> +	up_read(&kvm->mm->mmap_sem);
> +	return page;
>  }
>  
>  static int adapter_indicators_set(struct kvm *kvm,
> @@ -2852,30 +2798,35 @@ static int adapter_indicators_set(struct kvm *kvm,
>  {
>  	unsigned long bit;
>  	int summary_set, idx;
> -	struct s390_map_info *info;
> +	struct page *ind_page, *summary_page;
>  	void *map;
>  
> -	info = get_map_info(adapter, adapter_int->ind_addr);
> -	if (!info)
> +	ind_page = get_map_page(kvm, adapter, adapter_int->ind_addr);
> +	if (!ind_page)
>  		return -1;
> -	map = page_address(info->page);
> -	bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
> -	set_bit(bit, map);
> -	idx = srcu_read_lock(&kvm->srcu);
> -	mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
> -	set_page_dirty_lock(info->page);
> -	info = get_map_info(adapter, adapter_int->summary_addr);
> -	if (!info) {
> -		srcu_read_unlock(&kvm->srcu, idx);
> +	summary_page = get_map_page(kvm, adapter, adapter_int->summary_addr);
> +	if (!summary_page) {
> +		put_page(ind_page);
>  		return -1;
>  	}
> -	map = page_address(info->page);
> -	bit = get_ind_bit(info->addr, adapter_int->summary_offset,
> -			  adapter->swap);
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +	map = page_address(ind_page);
> +	bit = get_ind_bit(adapter_int->ind_addr,
> +			  adapter_int->ind_offset, adapter->swap);
> +	set_bit(bit, map);
> +	mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT);
> +	set_page_dirty_lock(ind_page);
> +	map = page_address(summary_page);
> +	bit = get_ind_bit(adapter_int->summary_addr,
> +			  adapter_int->summary_offset, adapter->swap);
>  	summary_set = test_and_set_bit(bit, map);
> -	mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
> -	set_page_dirty_lock(info->page);
> +	mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT);
> +	set_page_dirty_lock(summary_page);
>  	srcu_read_unlock(&kvm->srcu, idx);
> +
> +	put_page(ind_page);
> +	put_page(summary_page);
>  	return summary_set ? 0 : 1;
>  }
>  
> @@ -2897,9 +2848,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
>  	adapter = get_io_adapter(kvm, e->adapter.adapter_id);
>  	if (!adapter)
>  		return -1;
> -	down_read(&adapter->maps_lock);
>  	ret = adapter_indicators_set(kvm, adapter, &e->adapter);
> -	up_read(&adapter->maps_lock);
>  	if ((ret > 0) && !adapter->masked) {
>  		ret = kvm_s390_inject_airq(kvm, adapter);
>  		if (ret == 0)
> @@ -2951,12 +2900,15 @@ int kvm_set_routing_entry(struct kvm *kvm,
>  			  const struct kvm_irq_routing_entry *ue)
>  {
>  	int ret;
> +	u64 uaddr;
>  
>  	switch (ue->type) {
>  	case KVM_IRQ_ROUTING_S390_ADAPTER:
>  		e->set = set_adapter_int;
> -		e->adapter.summary_addr = ue->u.adapter.summary_addr;
> -		e->adapter.ind_addr = ue->u.adapter.ind_addr;
> +		uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr);
> +		e->adapter.summary_addr = uaddr;
> +		uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr);
> +		e->adapter.ind_addr = uaddr;
>  		e->adapter.summary_offset = ue->u.adapter.summary_offset;
>  		e->adapter.ind_offset = ue->u.adapter.ind_offset;
>  		e->adapter.adapter_id = ue->u.adapter.adapter_id;
>
David Hildenbrand Feb. 12, 2020, 12:16 p.m. UTC | #2
> +	/*
> +	 * We resolve the gpa to hva when setting the IRQ routing. If userspace
> +	 * decides to mess with the memslots it better also updates the irq
> +	 * routing. Otherwise we will write to the wrong userspace address.
> +	 */

I guess this is just as old handling, where a page was pinned. But
slightly better :) So the pages are definitely part of guest memory.

Fun stuff: If (a nasty) guest (in current code) zappes this page using
balloon inflation and the page is re-accessed (e.g., by the guest or by
the host), a new page will be faulted in, and there will be an
inconsistency between what the guest/user space sees and what this code
sees. Going via the user space address looks cleaner.

Now, with postcopy live migration, we will also zap all guest memory
before starting the guest, I do wonder if that produces a similar
inconsistency ... usually, when pages are pinned in the kernel, we
inhibit the balloon and implicitly also postcopy.

If so, this actually fixes an issue. But might depend on the order
things are initialized in user space. Or I am messing up things :)

[...]

>  static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
>  {
> -	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
> -	struct s390_map_info *map, *tmp;
> -	int found = 0;
> -
> -	if (!adapter || !addr)
> -		return -EINVAL;
> -
> -	down_write(&adapter->maps_lock);
> -	list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
> -		if (map->guest_addr == addr) {
> -			found = 1;
> -			atomic_dec(&adapter->nr_maps);
> -			list_del(&map->list);
> -			put_page(map->page);
> -			kfree(map);
> -			break;
> -		}
> -	}
> -	up_write(&adapter->maps_lock);
> -
> -	return found ? 0 : -EINVAL;
> +	return 0;

Can we get rid of this function?

>  }

> +static struct page *get_map_page(struct kvm *kvm,
> +				 struct s390_io_adapter *adapter,
> +				 u64 uaddr)
>  {
> -	struct s390_map_info *map;
> +	struct page *page;
> +	int ret;
>  
>  	if (!adapter)
>  		return NULL;
> -
> -	list_for_each_entry(map, &adapter->maps, list) {
> -		if (map->guest_addr == addr)
> -			return map;
> -	}
> -	return NULL;
> +	page = NULL;

struct page *page = NULL;

> +	if (!uaddr)
> +		return NULL;
> +	down_read(&kvm->mm->mmap_sem);
> +	ret = get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE,
> +				    &page, NULL, NULL);
> +	if (ret < 1)
> +		page = NULL;

Is that really necessary? According to the doc, pinned pages are stored
to the array.  ret < 1 means "no pages" were pinned, so nothing should
be stored.
Christian Borntraeger Feb. 12, 2020, 12:22 p.m. UTC | #3
On 12.02.20 13:16, David Hildenbrand wrote:
> 
>> +	/*
>> +	 * We resolve the gpa to hva when setting the IRQ routing. If userspace
>> +	 * decides to mess with the memslots it better also updates the irq
>> +	 * routing. Otherwise we will write to the wrong userspace address.
>> +	 */
> 
> I guess this is just as old handling, where a page was pinned. But
> slightly better :) So the pages are definitely part of guest memory.
> 
> Fun stuff: If (a nasty) guest (in current code) zappes this page using
> balloon inflation and the page is re-accessed (e.g., by the guest or by
> the host), a new page will be faulted in, and there will be an
> inconsistency between what the guest/user space sees and what this code
> sees. Going via the user space address looks cleaner.
> 
> Now, with postcopy live migration, we will also zap all guest memory
> before starting the guest, I do wonder if that produces a similar
> inconsistency ... usually, when pages are pinned in the kernel, we
> inhibit the balloon and implicitly also postcopy.
> 
> If so, this actually fixes an issue. But might depend on the order
> things are initialized in user space. Or I am messing up things :)

Yes, the current code has some corner cases where a guest can shoot himself
in the foot. This variant could actually be safer. 
> 
> [...]
> 
>>  static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
>>  {
>> -	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
>> -	struct s390_map_info *map, *tmp;
>> -	int found = 0;
>> -
>> -	if (!adapter || !addr)
>> -		return -EINVAL;
>> -
>> -	down_write(&adapter->maps_lock);
>> -	list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
>> -		if (map->guest_addr == addr) {
>> -			found = 1;
>> -			atomic_dec(&adapter->nr_maps);
>> -			list_del(&map->list);
>> -			put_page(map->page);
>> -			kfree(map);
>> -			break;
>> -		}
>> -	}
>> -	up_write(&adapter->maps_lock);
>> -
>> -	return found ? 0 : -EINVAL;
>> +	return 0;
> 
> Can we get rid of this function?

And do a return in the handler? maybe yes. Will have a look.
> 
>>  }
> 
>> +static struct page *get_map_page(struct kvm *kvm,
>> +				 struct s390_io_adapter *adapter,
>> +				 u64 uaddr)
>>  {
>> -	struct s390_map_info *map;
>> +	struct page *page;
>> +	int ret;
>>  
>>  	if (!adapter)
>>  		return NULL;
>> -
>> -	list_for_each_entry(map, &adapter->maps, list) {
>> -		if (map->guest_addr == addr)
>> -			return map;
>> -	}
>> -	return NULL;
>> +	page = NULL;
> 
> struct page *page = NULL;
> 
>> +	if (!uaddr)
>> +		return NULL;
>> +	down_read(&kvm->mm->mmap_sem);
>> +	ret = get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE,
>> +				    &page, NULL, NULL);
>> +	if (ret < 1)
>> +		page = NULL;
> 
> Is that really necessary? According to the doc, pinned pages are stored
> to the array.  ret < 1 means "no pages" were pinned, so nothing should
> be stored.

Probably. Will have a look.
Cornelia Huck Feb. 12, 2020, 12:39 p.m. UTC | #4
On Tue, 11 Feb 2020 04:23:41 -0500
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> From: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> 
> The adapter interrupt page containing the indicator bits is currently
> pinned. That means that a guest with many devices can pin a lot of
> memory pages in the host. This also complicates the reference tracking
> which is needed for memory management handling of protected virtual
> machines.
> We can simply try to get the userspace page set the bits and free the
> page. By storing the userspace address in the irq routing entry instead
> of the guest address we can actually avoid many lookups and list walks
> so that this variant is very likely not slower.
> 
> Signed-off-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> [borntraeger@de.ibm.com: patch simplification]
> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
> quick and dirty, how this could look like
> 
> 
>  arch/s390/include/asm/kvm_host.h |   3 -
>  arch/s390/kvm/interrupt.c        | 146 +++++++++++--------------------
>  2 files changed, 49 insertions(+), 100 deletions(-)
> 

(...)

> @@ -2488,83 +2485,26 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
>  
>  static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
>  {
> -	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
> -	struct s390_map_info *map;
> -	int ret;
> -
> -	if (!adapter || !addr)
> -		return -EINVAL;
> -
> -	map = kzalloc(sizeof(*map), GFP_KERNEL);
> -	if (!map) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> -	INIT_LIST_HEAD(&map->list);
> -	map->guest_addr = addr;
> -	map->addr = gmap_translate(kvm->arch.gmap, addr);
> -	if (map->addr == -EFAULT) {
> -		ret = -EFAULT;
> -		goto out;
> -	}
> -	ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
> -	if (ret < 0)
> -		goto out;
> -	BUG_ON(ret != 1);
> -	down_write(&adapter->maps_lock);
> -	if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
> -		list_add_tail(&map->list, &adapter->maps);
> -		ret = 0;
> -	} else {
> -		put_page(map->page);
> -		ret = -EINVAL;
> +	/*
> +	 * We resolve the gpa to hva when setting the IRQ routing. If userspace
> +	 * decides to mess with the memslots it better also updates the irq
> +	 * routing. Otherwise we will write to the wrong userspace address.
> +	 */
> +	return 0;

Given that this function now always returns 0, we basically get a
completely useless roundtrip into the kernel when userspace is trying
to setup the mappings.

Can we define a new IO_ADAPTER_MAPPING_NOT_NEEDED or so capability that
userspace can check?

This change in behaviour probably wants a change in the documentation
as well.

>  	}
> -	up_write(&adapter->maps_lock);
> -out:
> -	if (ret)
> -		kfree(map);
> -	return ret;
> -}
>  
>  static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
>  {
> -	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
> -	struct s390_map_info *map, *tmp;
> -	int found = 0;
> -
> -	if (!adapter || !addr)
> -		return -EINVAL;
> -
> -	down_write(&adapter->maps_lock);
> -	list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
> -		if (map->guest_addr == addr) {
> -			found = 1;
> -			atomic_dec(&adapter->nr_maps);
> -			list_del(&map->list);
> -			put_page(map->page);
> -			kfree(map);
> -			break;
> -		}
> -	}
> -	up_write(&adapter->maps_lock);
> -
> -	return found ? 0 : -EINVAL;
> +	return 0;

Same here.

>  }
>  
>  void kvm_s390_destroy_adapters(struct kvm *kvm)
>  {
>  	int i;
> -	struct s390_map_info *map, *tmp;
>  
>  	for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
>  		if (!kvm->arch.adapters[i])
>  			continue;
> -		list_for_each_entry_safe(map, tmp,
> -					 &kvm->arch.adapters[i]->maps, list) {
> -			list_del(&map->list);
> -			put_page(map->page);
> -			kfree(map);
> -		}
>  		kfree(kvm->arch.adapters[i]);

Call kfree() unconditionally?

>  	}
>  }
> @@ -2831,19 +2771,25 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
>  	return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
>  }
>  
> -static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
> -					  u64 addr)
> +static struct page *get_map_page(struct kvm *kvm,
> +				 struct s390_io_adapter *adapter,
> +				 u64 uaddr)
>  {
> -	struct s390_map_info *map;
> +	struct page *page;
> +	int ret;
>  
>  	if (!adapter)
>  		return NULL;
> -
> -	list_for_each_entry(map, &adapter->maps, list) {
> -		if (map->guest_addr == addr)
> -			return map;
> -	}
> -	return NULL;
> +	page = NULL;
> +	if (!uaddr)
> +		return NULL;
> +	down_read(&kvm->mm->mmap_sem);
> +	ret = get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE,
> +				    &page, NULL, NULL);
> +	if (ret < 1)
> +		page = NULL;
> +	up_read(&kvm->mm->mmap_sem);
> +	return page;
>  }
>  
>  static int adapter_indicators_set(struct kvm *kvm,

(...)

> @@ -2951,12 +2900,15 @@ int kvm_set_routing_entry(struct kvm *kvm,
>  			  const struct kvm_irq_routing_entry *ue)
>  {
>  	int ret;
> +	u64 uaddr;
>  
>  	switch (ue->type) {
>  	case KVM_IRQ_ROUTING_S390_ADAPTER:
>  		e->set = set_adapter_int;
> -		e->adapter.summary_addr = ue->u.adapter.summary_addr;
> -		e->adapter.ind_addr = ue->u.adapter.ind_addr;
> +		uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr);

Can gmap_translate() return -EFAULT here? The code above only seems to
check for 0... do we want to return an error here?

> +		e->adapter.summary_addr = uaddr;
> +		uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr);
> +		e->adapter.ind_addr = uaddr;
>  		e->adapter.summary_offset = ue->u.adapter.summary_offset;
>  		e->adapter.ind_offset = ue->u.adapter.ind_offset;
>  		e->adapter.adapter_id = ue->u.adapter.adapter_id;
Christian Borntraeger Feb. 12, 2020, 12:44 p.m. UTC | #5
On 12.02.20 13:39, Cornelia Huck wrote:
[...]

>> +	 */
>> +	return 0;
> 
> Given that this function now always returns 0, we basically get a
> completely useless roundtrip into the kernel when userspace is trying
> to setup the mappings.
> 
> Can we define a new IO_ADAPTER_MAPPING_NOT_NEEDED or so capability that
> userspace can check?

Nack. This is one system call per initial indicator ccw. This is so seldom
and cheap that I do not see a point in optimizing this. 


> This change in behaviour probably wants a change in the documentation
> as well.

Yep. 
[...]

>> @@ -2951,12 +2900,15 @@ int kvm_set_routing_entry(struct kvm *kvm,
>>  			  const struct kvm_irq_routing_entry *ue)
>>  {
>>  	int ret;
>> +	u64 uaddr;
>>  
>>  	switch (ue->type) {
>>  	case KVM_IRQ_ROUTING_S390_ADAPTER:
>>  		e->set = set_adapter_int;
>> -		e->adapter.summary_addr = ue->u.adapter.summary_addr;
>> -		e->adapter.ind_addr = ue->u.adapter.ind_addr;
>> +		uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr);
> 
> Can gmap_translate() return -EFAULT here? The code above only seems to
> check for 0... do we want to return an error here?

Yes.

> 
>> +		e->adapter.summary_addr = uaddr;
>> +		uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr);
>> +		e->adapter.ind_addr = uaddr;
>>  		e->adapter.summary_offset = ue->u.adapter.summary_offset;
>>  		e->adapter.ind_offset = ue->u.adapter.ind_offset;
>>  		e->adapter.adapter_id = ue->u.adapter.adapter_id;
>
David Hildenbrand Feb. 12, 2020, 12:47 p.m. UTC | #6
On 12.02.20 13:22, Christian Borntraeger wrote:
> 
> 
> On 12.02.20 13:16, David Hildenbrand wrote:
>>
>>> +	/*
>>> +	 * We resolve the gpa to hva when setting the IRQ routing. If userspace
>>> +	 * decides to mess with the memslots it better also updates the irq
>>> +	 * routing. Otherwise we will write to the wrong userspace address.
>>> +	 */
>>
>> I guess this is just as old handling, where a page was pinned. But
>> slightly better :) So the pages are definitely part of guest memory.
>>
>> Fun stuff: If (a nasty) guest (in current code) zappes this page using
>> balloon inflation and the page is re-accessed (e.g., by the guest or by
>> the host), a new page will be faulted in, and there will be an
>> inconsistency between what the guest/user space sees and what this code
>> sees. Going via the user space address looks cleaner.
>>
>> Now, with postcopy live migration, we will also zap all guest memory
>> before starting the guest, I do wonder if that produces a similar
>> inconsistency ... usually, when pages are pinned in the kernel, we
>> inhibit the balloon and implicitly also postcopy.
>>
>> If so, this actually fixes an issue. But might depend on the order
>> things are initialized in user space. Or I am messing up things :)
> 
> Yes, the current code has some corner cases where a guest can shoot himself
> in the foot. This variant could actually be safer. 

At least with postcopy it would be a silent migration issue, not guest
triggered. But I am not sure if it can trigger.

Anyhow, this is safer :)
Cornelia Huck Feb. 12, 2020, 1:07 p.m. UTC | #7
On Wed, 12 Feb 2020 13:44:53 +0100
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> On 12.02.20 13:39, Cornelia Huck wrote:
> [...]
> 
> >> +	 */
> >> +	return 0;  
> > 
> > Given that this function now always returns 0, we basically get a
> > completely useless roundtrip into the kernel when userspace is trying
> > to setup the mappings.
> > 
> > Can we define a new IO_ADAPTER_MAPPING_NOT_NEEDED or so capability that
> > userspace can check?  
> 
> Nack. This is one system call per initial indicator ccw. This is so seldom
> and cheap that I do not see a point in optimizing this. 

NB that zpci also calls this. Probably a rare event there as well.

> 
> 
> > This change in behaviour probably wants a change in the documentation
> > as well.  
> 
> Yep.
diff mbox series

Patch

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 0d398738ded9..88a218872fa0 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -771,9 +771,6 @@  struct s390_io_adapter {
 	bool masked;
 	bool swap;
 	bool suppressible;
-	struct rw_semaphore maps_lock;
-	struct list_head maps;
-	atomic_t nr_maps;
 };
 
 #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d4d35ec79e12..e6fe8b61ee9b 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2459,9 +2459,6 @@  static int register_io_adapter(struct kvm_device *dev,
 	if (!adapter)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&adapter->maps);
-	init_rwsem(&adapter->maps_lock);
-	atomic_set(&adapter->nr_maps, 0);
 	adapter->id = adapter_info.id;
 	adapter->isc = adapter_info.isc;
 	adapter->maskable = adapter_info.maskable;
@@ -2488,83 +2485,26 @@  int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
 
 static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
 {
-	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
-	struct s390_map_info *map;
-	int ret;
-
-	if (!adapter || !addr)
-		return -EINVAL;
-
-	map = kzalloc(sizeof(*map), GFP_KERNEL);
-	if (!map) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	INIT_LIST_HEAD(&map->list);
-	map->guest_addr = addr;
-	map->addr = gmap_translate(kvm->arch.gmap, addr);
-	if (map->addr == -EFAULT) {
-		ret = -EFAULT;
-		goto out;
-	}
-	ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
-	if (ret < 0)
-		goto out;
-	BUG_ON(ret != 1);
-	down_write(&adapter->maps_lock);
-	if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
-		list_add_tail(&map->list, &adapter->maps);
-		ret = 0;
-	} else {
-		put_page(map->page);
-		ret = -EINVAL;
+	/*
+	 * We resolve the gpa to hva when setting the IRQ routing. If userspace
+	 * decides to mess with the memslots it better also updates the irq
+	 * routing. Otherwise we will write to the wrong userspace address.
+	 */
+	return 0;
 	}
-	up_write(&adapter->maps_lock);
-out:
-	if (ret)
-		kfree(map);
-	return ret;
-}
 
 static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
 {
-	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
-	struct s390_map_info *map, *tmp;
-	int found = 0;
-
-	if (!adapter || !addr)
-		return -EINVAL;
-
-	down_write(&adapter->maps_lock);
-	list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
-		if (map->guest_addr == addr) {
-			found = 1;
-			atomic_dec(&adapter->nr_maps);
-			list_del(&map->list);
-			put_page(map->page);
-			kfree(map);
-			break;
-		}
-	}
-	up_write(&adapter->maps_lock);
-
-	return found ? 0 : -EINVAL;
+	return 0;
 }
 
 void kvm_s390_destroy_adapters(struct kvm *kvm)
 {
 	int i;
-	struct s390_map_info *map, *tmp;
 
 	for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
 		if (!kvm->arch.adapters[i])
 			continue;
-		list_for_each_entry_safe(map, tmp,
-					 &kvm->arch.adapters[i]->maps, list) {
-			list_del(&map->list);
-			put_page(map->page);
-			kfree(map);
-		}
 		kfree(kvm->arch.adapters[i]);
 	}
 }
@@ -2831,19 +2771,25 @@  static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
 	return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
 }
 
-static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
-					  u64 addr)
+static struct page *get_map_page(struct kvm *kvm,
+				 struct s390_io_adapter *adapter,
+				 u64 uaddr)
 {
-	struct s390_map_info *map;
+	struct page *page;
+	int ret;
 
 	if (!adapter)
 		return NULL;
-
-	list_for_each_entry(map, &adapter->maps, list) {
-		if (map->guest_addr == addr)
-			return map;
-	}
-	return NULL;
+	page = NULL;
+	if (!uaddr)
+		return NULL;
+	down_read(&kvm->mm->mmap_sem);
+	ret = get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE,
+				    &page, NULL, NULL);
+	if (ret < 1)
+		page = NULL;
+	up_read(&kvm->mm->mmap_sem);
+	return page;
 }
 
 static int adapter_indicators_set(struct kvm *kvm,
@@ -2852,30 +2798,35 @@  static int adapter_indicators_set(struct kvm *kvm,
 {
 	unsigned long bit;
 	int summary_set, idx;
-	struct s390_map_info *info;
+	struct page *ind_page, *summary_page;
 	void *map;
 
-	info = get_map_info(adapter, adapter_int->ind_addr);
-	if (!info)
+	ind_page = get_map_page(kvm, adapter, adapter_int->ind_addr);
+	if (!ind_page)
 		return -1;
-	map = page_address(info->page);
-	bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
-	set_bit(bit, map);
-	idx = srcu_read_lock(&kvm->srcu);
-	mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
-	set_page_dirty_lock(info->page);
-	info = get_map_info(adapter, adapter_int->summary_addr);
-	if (!info) {
-		srcu_read_unlock(&kvm->srcu, idx);
+	summary_page = get_map_page(kvm, adapter, adapter_int->summary_addr);
+	if (!summary_page) {
+		put_page(ind_page);
 		return -1;
 	}
-	map = page_address(info->page);
-	bit = get_ind_bit(info->addr, adapter_int->summary_offset,
-			  adapter->swap);
+
+	idx = srcu_read_lock(&kvm->srcu);
+	map = page_address(ind_page);
+	bit = get_ind_bit(adapter_int->ind_addr,
+			  adapter_int->ind_offset, adapter->swap);
+	set_bit(bit, map);
+	mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT);
+	set_page_dirty_lock(ind_page);
+	map = page_address(summary_page);
+	bit = get_ind_bit(adapter_int->summary_addr,
+			  adapter_int->summary_offset, adapter->swap);
 	summary_set = test_and_set_bit(bit, map);
-	mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
-	set_page_dirty_lock(info->page);
+	mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT);
+	set_page_dirty_lock(summary_page);
 	srcu_read_unlock(&kvm->srcu, idx);
+
+	put_page(ind_page);
+	put_page(summary_page);
 	return summary_set ? 0 : 1;
 }
 
@@ -2897,9 +2848,7 @@  static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
 	adapter = get_io_adapter(kvm, e->adapter.adapter_id);
 	if (!adapter)
 		return -1;
-	down_read(&adapter->maps_lock);
 	ret = adapter_indicators_set(kvm, adapter, &e->adapter);
-	up_read(&adapter->maps_lock);
 	if ((ret > 0) && !adapter->masked) {
 		ret = kvm_s390_inject_airq(kvm, adapter);
 		if (ret == 0)
@@ -2951,12 +2900,15 @@  int kvm_set_routing_entry(struct kvm *kvm,
 			  const struct kvm_irq_routing_entry *ue)
 {
 	int ret;
+	u64 uaddr;
 
 	switch (ue->type) {
 	case KVM_IRQ_ROUTING_S390_ADAPTER:
 		e->set = set_adapter_int;
-		e->adapter.summary_addr = ue->u.adapter.summary_addr;
-		e->adapter.ind_addr = ue->u.adapter.ind_addr;
+		uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr);
+		e->adapter.summary_addr = uaddr;
+		uaddr =  gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr);
+		e->adapter.ind_addr = uaddr;
 		e->adapter.summary_offset = ue->u.adapter.summary_offset;
 		e->adapter.ind_offset = ue->u.adapter.ind_offset;
 		e->adapter.adapter_id = ue->u.adapter.adapter_id;